Example #1
class TestSimhashIndex(TestCase):
    data = {
        1:
        u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.',
        2:
        u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than',
        3: u'This is a different one.',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs)

    def test_get_near_dup(self):
        s1 = Simhash(
            u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
        )
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 1)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 1)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)
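The TestCase snippets on this page are clipped from larger test modules and omit their imports; assuming the standard unittest and simhash packages that the other examples here import explicitly, the header they rely on is just:

# Imports assumed by the TestSimhashIndex examples (clipped by the aggregator).
from unittest import TestCase

from simhash import Simhash, SimhashIndex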
Example #2
class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)
Example #3
class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))
Example #4
def extract_next_links(url, resp) -> list:
    defrag = urldefrag(url)[0]
    print(defrag)
    if resp.status == 200:
        print("Scanning")
        # Read the body up front so `content` is defined for getAllUrls()
        # even when the defragmented URL has already been seen.
        content = resp.raw_response.text
        if defrag not in urls:
            data = getVisibleText(content)
            simmed = Simhash(data)
            if simmed.value not in sims:
                index = SimhashIndex(objs, k=3)
                if len(index.get_near_dups(simmed)) == 0:
                    urls.add(defrag)
                    sims.add(simmed.value)
                    objs.append((url, simmed))
                    print(len(urls), len(sims), len(objs))
                    try:
                        file = open("data_dump.txt", "a", errors="ignore")
                        to_write = url + " \n " + data + "\n" + str(simmed.value) + "\n\n"
                        file.write(to_write)
                    except Exception as e:
                        raise e
                    finally:
                        file.close()
            #urls[defrag].add(getVisibleText(content))
            #print(urls[defrag])
        return getAllUrls(url, content)
    else:
        print("Can't scan")
        return []
Example #5
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    
    print(index.bucket_size())
    
    s1 = Simhash(get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))
    
    index.add("4", s1)
    print(index.get_near_dups(s1))
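Examples #5 and #6 call a get_features helper without showing it; the definition used elsewhere on this page (see Example #28) is a character-trigram tokenizer over the lowercased, punctuation-stripped text:

import re

def get_features(s):
    # Character shingles of width 3 over the normalized string.
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]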
Example #6
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    print(index.bucket_size())

    s1 = Simhash(
        get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))

    index.add("4", s1)
    print(index.get_near_dups(s1))
Example #7
def simhash_test():
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }
    for k, v in data.items(): print k, get_phrases(v)
    for k, v in data.items(): print k, Simhash(get_phrases(v)).value

    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    print index.bucket_size()

    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print index.get_near_dups(s1)

    index.add('4', s1)
    print index.get_near_dups(s1)
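Example #7 depends on a get_phrases helper that is not shown anywhere on this page. A purely illustrative sketch (an assumption, not the original implementation) would emit overlapping word bigrams as features:

import re

def get_phrases(s, width=2):
    # Hypothetical: lowercase, tokenize into words, and emit overlapping
    # word n-grams ("phrases") for Simhash to weigh.
    words = re.findall(r'\w+', s.lower())
    if len(words) < width:
        return [' '.join(words)] if words else []
    return [' '.join(words[i:i + width]) for i in range(len(words) - width + 1)]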
Example #8
def clustering():
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []
    entrydic = {}
    for item in entrylist:
        if not is_en(item[1]):
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }

    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            sims = index.get_near_dups(
                Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) +
                               '\t' + entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' + entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    conn.close()
Example #9
def main():
    # user_query = input()
    DOCID = 0


    numPartial = 1 

    index = SimhashIndex([])

    totaldocs = 0
    docnum = 0

    validDocFile = open('validDocs2', 'w')

    for root, dirs, files in os.walk(DEVPATH):
        for fname in files:
            if not fname.endswith(".json"):
                continue
            totaldocs += 1
            h2t = html2text.HTML2Text()

            file = open(root + "/" + fname)

            pageDict = json.loads(file.read())

            # close file to get memory back
            file.close()

            # get html formated content
            htmlContent = pageDict['content']

            print(pageDict['url'])

            plainContent = h2t.handle(htmlContent)

            feat = get_features(plainContent)

            sim = Simhash(feat)

            if len(index.get_near_dups(sim)) > 0:
                continue

            print(docnum, totaldocs)

            index.add(str(docnum), sim)

            validDocFile.write(root + "/" + fname + "\n")

            docnum+=1


    validDocFile.close()
Example #10
class SpellingCorrector(object):
  def __init__(self, vocab_to_freq, f=64, k=32):
    self.vocab_to_freq = vocab_to_freq
    self.simhash_index = SimhashIndex([], f=f, k=k)
    self.f = f
    self.k = k
    
    simhash_index = self.simhash_index
    for w in vocab_to_freq:
      sh = Simhash(w, f=f)
      simhash_index.add(w, sh)
  
  def add_valid_word(self, word):
    if word not in self.vocab_to_freq:
      sh = Simhash(word, self.f)
      self.simhash_index.add(word, sh)
    self.vocab_to_freq[word] = self.vocab_to_freq.get(word, 0) + 1
    
  def correct_word(self, word):
    
    if word in self.vocab_to_freq:
      return word
    
    #Edit distance between
    sh = Simhash(word, f=self.f)
    candidates = self.simhash_index.get_near_dups(sh)
    
    if not candidates:
      #No near dups. Oh well. This word will go as it is.
      print 'no candidates'
      return word
    
    if len(candidates) == 1:
      #Only one candidate, so assume this is the correction
      return candidates[0]
      
    lev_dist_gen = ((other_w, levenshtein(other_w, word)) for other_w in candidates)
    closest_words, dists = zip(*all_min_or_max(lev_dist_gen, min, lambda item: item[1]))
    
    if len(closest_words) == 1:
      #One of the candidates had the best edit distance. Return that.
      return closest_words[0]
    
    #OK, there are multiple closest words. Rely on word frequency to choose the right one.
    vocab_to_freq = self.vocab_to_freq
    word_freq_gen = ((other_w, vocab_to_freq[other_w]) for other_w in closest_words)
    most_freq_words, freqs = zip(*all_min_or_max(word_freq_gen, max, lambda item: item[1]))
    
    #using choice because at this point there's no other way to narrow it down, unless we
    #track higher order ngrams.
    return choice(most_freq_words)
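The SpellingCorrector above calls three helpers it never defines: levenshtein, all_min_or_max, and choice. Judging only from how they are used, plausible stand-ins (assumptions inferred from the call sites, not the original code) are:

from random import choice  # ties are broken at random in correct_word

def levenshtein(a, b):
  # Plain dynamic-programming edit distance.
  prev = list(range(len(b) + 1))
  for i, ca in enumerate(a, 1):
    cur = [i]
    for j, cb in enumerate(b, 1):
      cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
    prev = cur
  return prev[-1]

def all_min_or_max(items, min_or_max, key):
  # Return every item that attains the min (or max) of key(item),
  # matching the zip(*...) unpacking used in correct_word.
  items = list(items)
  best = min_or_max(key(item) for item in items)
  return [item for item in items if key(item) == best]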
Example #11
def get_near_dups(query_simhash, candidates_simhash, k):
    res = [0] * len(candidates_simhash)
    query = Simhash(value=query_simhash)

    for i in range(len(candidates_simhash)):
        candidates_simhash[i] = (str(i), Simhash(value=candidates_simhash[i]))
    index = SimhashIndex(candidates_simhash, k=k)
    near_dups = index.get_near_dups(query)

    for dup in near_dups:
        res[int(dup)] = 1

    return res
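A hedged usage sketch for the wrapper above: the fingerprints here are computed from sample strings, and the result marks which candidates fall within Hamming distance k of the query.

# Illustrative only; any 64-bit simhash values would do.
query_value = Simhash('How are you? I am fine. Thanks.').value
candidate_values = [
    Simhash('How are you? I am fine. Thanks!').value,
    Simhash('This is simhash test.').value,
]
print(get_near_dups(query_value, candidate_values, k=8))  # e.g. [1, 0]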
Example #12
def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl  title  author  images  links  text  pub_time
            # 1        2      3       4       5      6     7
            # jushi  shouji  zujin  dizhi  ditie  url  crawl_time  source  ext
            # 8      9       10     11     12     13   14          15      16
            array = line.rstrip('\r\n').split('\t')
            hashurl=array[0]     #string,key
            title=array[1]       #string
            text=array[5]        #string
            pub_time=array[6]    #string 
            url=array[12]        #string 

            sim = Simhash((title+text).decode('utf-8'))
            d.update({
                hashurl:(title, url, pub_time)
            })
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl:sim})

    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if d.has_key(h):
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i:i[3], reverse=True)
            simdb.insert('\t'.join(
                [buf_list[0][0], json.dumps(buf_list[1:])]
            ))
Example #13
def find_near_matches(session, collection, index_size, probability_index_near_match):
    from simhash import Simhash, SimhashIndex
    logging.getLogger().setLevel(logging.CRITICAL)

    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter','simhash']])
        .where(model.Tweet.collection == collection)
    )

    simhash_index = SimhashIndex([], k=7)

    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )

    indexed_tweet_ids = []

    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):

        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()

        simhash = Simhash(simhash_value)

        near_matches_ids = simhash_index.get_near_dups(simhash)

        if not near_matches_ids:
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))

            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))

        if near_matches_ids:
            near_match_id = min(near_matches_ids)

            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )

    session.commit()
Example #14
def console_test():
    from simhash import Simhash, SimhashIndex
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }
    objs = [(str(k), Simhash(v)) for k, v in data.items()]
    index = SimhashIndex(objs, k=10)
    s1 = Simhash(
        u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
    )
    dups = index.get_near_dups(s1)
    dups = index.get_near_dups2(s1, 5)
    index.remove(s1)
Example #15
def simhash_clustering(
    signatures: List[int],
    hamming_distance: int = 3,
    # num_blocks: Optional[int] = 5,
) -> List[List[int]]:

    index = SimhashIndex([(i, Simhash(value=signature))
                          for i, signature in enumerate(signatures)],
                         k=hamming_distance)

    neighbors: List[List[int]] = []
    for signature in signatures:
        neighbors.append(
            list(map(int, index.get_near_dups(Simhash(value=signature)))))

    return neighbors
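A hedged usage sketch for simhash_clustering with signatures computed from three sample strings; the ids inside each neighbour list come back in arbitrary order.

signatures = [Simhash(text).value for text in [
    'How are you? I Am fine. Thanks.',
    'How are you i am fine. Thanks.',
    'This is simhash test.',
]]
print(simhash_clustering(signatures, hamming_distance=3))
# e.g. [[0, 1], [0, 1], [2]]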
Example #16
    def process_graph(self, project_id):
        visits = defaultdict(list)
        p = 0
        hashtags_db = Hashtag.objects.filter(project_id=project_id)

        logger.info("Total hashtags to process " + str(len(hashtags_db)))
        for hashtag_entry in hashtags_db:
            visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
            p += 1

        logger.info("Hashtag read")
        logger.info("Hashtag processed " + str(p))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, hashtags in visits.iteritems():
            if len(hashtags) > MIN_HASHTAG_PER_USER:
                simhash = Simhash(hashtags, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" %
                    (cant_processed, len(visits)))
        cant_processed = 0
        for user, hashtags in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    hashtag_near_dups = visits[user_near_dups]
                    intersect = set(hashtags).intersection(hashtag_near_dups)
                    ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                    if ratio >= 0.1:
                        hashtag_graph = HashtagGraph(user_oid_i=user,
                                                     user_oid_j=user_near_dups,
                                                     ratio=ratio)
                        hashtag_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                logger.info("%i processed" % cant_processed)
Example #17
def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl  title  author  images  links  text  pub_time
            # 1        2      3       4       5      6     7
            # jushi  shouji  zujin  dizhi  ditie  url  crawl_time  source  ext
            # 8      9       10     11     12     13   14          15      16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]  #string,key
            title = array[1]  #string
            text = array[5]  #string
            pub_time = array[6]  #string
            url = array[12]  #string

            sim = Simhash((title + text).decode('utf-8'))
            d.update({hashurl: (title, url, pub_time)})
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl: sim})

    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if d.has_key(h):
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))
Example #18
    def process_graph(self, project_id):
        visits = defaultdict(list)
        processed = 0
        urls_db = Urls.objects.filter(project_id=project_id)

        logger.info("Total urls to process " + str(len(urls_db)))
        for url_entry in urls_db:
            visits[url_entry.user_id].append(url_entry.url)
            processed += 1
        logger.info("Urls read")
        logger.info("Urls processed " + str(processed))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, urls in visits.iteritems():
            if len(urls) > MIN_URLS_PER_USER:
                simhash = Simhash(urls, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" %
                    (cant_processed, len(visits)))
        cant_processed = 0
        for user, urls in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(urls, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    urls_near_dups = visits[user_near_dups]
                    intersect = set(urls).intersection(urls_near_dups)
                    ratio = len(intersect) * 1.0 / len(urls_near_dups)
                    if ratio >= 0.1:
                        url_graph = UrlsGraph(user_oid_i=user,
                                              user_oid_j=user_near_dups,
                                              ratio=ratio)
                        url_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                logger.info("%i processed" % cant_processed)
Example #19
class TestSimhashIndex(TestCase):
    def setUp(self):
        data = {
            1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
            2: u'How are you i am fine. blar blar blar blar blar than',
            3: u'This is simhash test.',
        }
        objs = [(str(k), Simhash(v)) for k, v in data.items()]
        self.index = SimhashIndex(objs)

    def test_bucket_size(self):
        self.assertEqual(self.index.bucket_size(), 6)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)

        self.assertEqual(len(dups), 2)
Example #20
def simhashsort(datadic, entryset):
    objs = [(id, Simhash(sent)) for id, sent in datadic.items()]
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance: the larger k is, the more similar texts are retrieved
    kind = 1  # cluster number
    sorted = set()
    for id in datadic:
        if str(id) in sorted:  # do not classify an id twice
            continue
        # collect the set of near duplicates
        similiarlist = index.get_near_dups(Simhash(datadic[id]))
        similiarlist.append(str(id))
        # write the cluster information back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["cluster"] = kind
        kind += 1
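A hedged usage sketch for simhashsort; tolerance, datadic, and entryset are illustrative stand-ins for the module-level threshold and the data structures the function expects.

tolerance = 3  # hypothetical module-level Hamming threshold read by simhashsort
datadic = {
    1: 'How are you? I Am fine. Thanks.',
    2: 'How are you i am fine. Thanks.',
    3: 'This is simhash test.',
}
entryset = [{'id': k, 'cluster': 0} for k in datadic]
simhashsort(datadic, entryset)
print(entryset)  # entries 1 and 2 are expected to end up in the same cluster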
Example #21
    def process_graph(self, project_id):
        visits = defaultdict(list)
        p = 0
        hashtags_db = Hashtag.objects.filter(project_id=project_id)

        logger.info("Total hashtags to process "+str(len(hashtags_db)))
        for hashtag_entry in hashtags_db:
            visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
            p +=1

        logger.info("Hashtag read")
        logger.info("Hashtag processed " + str(p))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, hashtags in visits.iteritems():
            if len(hashtags) > MIN_HASHTAG_PER_USER:
                simhash = Simhash(hashtags, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" % (cant_processed, len(visits)))
        cant_processed = 0
        for user, hashtags in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    hashtag_near_dups = visits[user_near_dups]
                    intersect = set(hashtags).intersection(hashtag_near_dups)
                    ratio = len(intersect)*1.0/len(hashtag_near_dups)
                    if ratio >= 0.1:
                        hashtag_graph = HashtagGraph(user_oid_i=user, user_oid_j=user_near_dups, ratio=ratio)
                        hashtag_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                logger.info("%i processed" % cant_processed)
Example #22
def save_duplicates(save_path, text2hash_dict, k=5):
    """Group similar docs' title"""
    # Construct SimhashIndex object for similar docs detection. k is tolerance.
    index = SimhashIndex(text2hash_dict, k=k)

    done = list()
    with tqdm(total=len(text2hash_dict)) as pbar:
        with open(save_path, 'w', encoding='utf8') as file:
            for i in range(len(text2hash_dict) - 1):
                # get near duplicates
                near_dups = index.get_near_dups(text2hash_dict[i][1])
                # near dups includes origin title, len > 1 requested
                if len(near_dups) > 1 and text2hash_dict[i][0] not in done:
                    for title in near_dups:
                        file.write(title)
                        file.write('\n')
                    file.write('#' * 5 + '\n')
                    done.extend(near_dups)
                pbar.update()
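Despite its name, text2hash_dict above is indexed positionally and unpacked as (title, Simhash) pairs, so a list of tuples is the natural input. A hedged usage sketch (assuming the snippet's own imports of tqdm and simhash are in place; the output path is illustrative):

titles = [
    'How are you? I Am fine. Thanks.',
    'How are you i am fine. Thanks.',
    'This is simhash test.',
]
text2hash_dict = [(t, Simhash(t)) for t in titles]
save_duplicates('duplicates.txt', text2hash_dict, k=5)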
Example #23
def simhashSort2(datadic, entryset):
    objs = []
    for entry in datadic:
        objs.append((entry[0], Simhash(entry[1])))
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance: the larger k is, the more similar texts are retrieved
    kind = 1  # cluster number
    sorted = set()
    for item in datadic:
        if str(item[0]) in sorted:  # do not classify an item twice
            continue
        # collect the set of near duplicates
        similiarlist = index.get_near_dups(Simhash(item[1]))
        similiarlist.append(str(item[0]))
        # write the cluster information back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["sim_count"] = kind
        kind += 1
Example #24
def simhash_1(labels, targets, query, query_url, dataset, k=2, width=5):
    dictionary = dict(zip(labels, targets))
    objs = [(str(k), Simhash(get_features(v, width)))
            for k, v in dictionary.items()]
    index = SimhashIndex(objs, k=k)
    query_simhash = Simhash(get_features(query, width))
    near_dups = index.get_near_dups(query_simhash)

    # Save fingerprints for future use
    appendToFingerprints(
        dataset, './dataset/fingerprints.csv', {
            "query": str(query_simhash.value),
            "duplicates": ' '.join([str(obj[1].value) for obj in objs])
        })
    # print("QUERY: {}".format(query_url))
    # pp(near_dups)

    return {
        "dataset": dataset,
        "query": query_url,
        "duplicates": ' '.join(near_dups)
    }
Example #25
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex

    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()

    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start

    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start

    print len(d1), len(d2)

    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b
Example #26
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex

    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()

    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start

    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start

    print len(d1), len(d2)

    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b
Example #27
keys = fourgram.keys()
f1 = open('rezFinalNoDuplicates.txt', 'w')
objs = []
for k in fourgram:
    try:
        objs.append((k, Simhash(fourgram[k])))
    except Exception as e:
        print e
#objs = [(k, Simhash(fourgram[k])) for k in fourgram]
index = SimhashIndex(objs, k=3)

print "bucket_size", index.bucket_size()

for key in keys:
    s1 = Simhash(fourgram[key])
    duplicates = ", ".join(index.get_near_dups(s1))
    f1.write(key + "\t" + duplicates+"\n")
    print key, duplicates

'''
while len(keys) > 0:
    key = keys.pop()
    keysJ = list(keys)
    f1.write(key + '\t' + text[key])

    while len(keysJ) > 0:
        j = keysJ.pop()
        intersect = fourgram[key] & fourgram[j]
        #print "checking", text[j]
        #print "forgram ", fourgram[key], "fourgram", fourgram[j]
        #print "calculation", len(intersect) , len(fourgram[key]) / 2.0
Example #28
import re
from simhash import Simhash, SimhashIndex


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())

s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))
Example #29
from simhash import Simhash
print '%x' % Simhash('How are you? I am fine. Thanks.').value
print '%x' % Simhash('How are u? I am fine.     Thanks.').value
print '%x' % Simhash('How r you?I    am fine. Thanks.').value


from simhash import Simhash, SimhashIndex
data = {
	1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
	2: u'How are you i am fine. blar blar blar blar blar than',
	3: u'This is simhash test.',
}
objs = [(str(k), Simhash(v)) for k, v in data.items()]
index = SimhashIndex(objs)

print index.bucket_size()

s1 = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
print index.get_near_dups(s1)

index.add('4', s1)
print index.get_near_dups(s1)

# build by features
print '%x' % Simhash('How').value
print '%x' % Simhash(['How']).value
print '%x' % Simhash(['How', 'are', 'you']).value


Example #30
wrongos = list(set(match) & set(osArray))

print '---------------------------------'
print ' Wrong Operating System'
print '---------------------------------'
print len(wrongos)

#########################################################################################################################################

if (args.near.upper() == 'Y'):
    fuzzy = []
    for line in unknown:
        try:
            fuzz = Simhash(get_features(unicode(line)))
            num = index.get_near_dups(fuzz)
            if len(num) != 0:
                fuzzy.append(line)
        except:
            pass

    print '---------------------------------'
    print ' Total Fuzzy Near Matches'
    print '---------------------------------'
    print len(fuzzy)

#########################################################################################################################################

wrongpath = []

for line in unknown:
Example #31
test_index = [(u[0], Simhash(u[0])) for u in urls]

# simhash_results_a.txt : k=20 (subset)
# simhash_results_b.txt

with open('testdata/solr_20150320/simhash_results_k10.txt', 'w') as f:
    f.write('')

start_time = time.time()

for index, (test_url, test_simhash) in enumerate(test_index):
    i_start_time = time.time()
    if index % 50 == 0:
        print 'completed {0} of {1}'.format(index, len(urls))

    duplicates = []

    # Build the chunked index under a different name so the enumerate
    # counter `index` above is not overwritten.
    for i in xrange(0, len(test_index), 300):
        chunk_index = SimhashIndex(test_index[i:i + 300], k=10)
        dupes = chunk_index.get_near_dups(test_simhash)

        if len(dupes) > 0:
            duplicates += dupes

    print '\t{0} takes {1}'.format(len(duplicates), time.time() - i_start_time)

    with open('testdata/solr_20150320/simhash_results_k10.txt', 'a') as f:
        f.write(json.dumps({test_url: duplicates}) + '\n')

print 'takes:', time.time() - start_time
Example #32
		logger.info('{create_time:{$gte:%ld,$lt:%ld} }' %(lasttimestamp,curtimestamp) )
		status_count = weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }).count()
		logger.info('status_count: %d' %status_count)
		if status_count < 10:
			connection.close();mylogger.close()
			sys.exit(0)
		stopwords = loadstopwords(stopwordsfilename)
		fdoc=open(docfile,'w');fcut=open(cutfile,'w')
		num=0;simnum=0;cutnum=0
		#simhash
		index = SimhashIndex({})
		for one in weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }):
			weibo_id = str(one['_id'])
			weibo_text = one['data']['text'].strip()
			text_sh = Simhash(weibo_text )
			if len(index.get_near_dups(text_sh) ) == 0: #not find sim
				#cut
				text_seg = jieba.cut(weibo_text)
				text_result = list(set(text_seg) - stopwords)
				content = ' 1 '.join(text_result)
				if content != '':
					fdoc.write(weibo_id+'\t'+weibo_text.encode('utf-8')+'\n');fcut.write(content.encode('utf-8')+' 1\n')
					cutnum += 1
				simnum += 1
			num += 1
			index.add(num,text_sh)
	except pymongo.errors,e:
		logger.critical('mongo find error: %s' %e)
		sys.exit(-2)

	logger.info('simnum: %d ' %simnum);
Example #33
class NearDuplicate:
    def __init__(self, filenames, k=2, metadata_dictionary=None):
        self.filenames = filenames
        self.simhash_index = None
        self.image_dictionary = {}
        self.metadata_dictionary = metadata_dictionary
        self.k = k
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means)

    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """ Use this function when we provide json metadata information from
            the tika java module"""

        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})

        # The tags or type of metadata we want
        feature_tags = [
            "Image Height", "Image Width", "File Size", "Content-Type",
            "Image Bytes", "File Name Suffix"
        ]

        # Create a feature array using these metadata values
        features = []

        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Files Size": 2,
            "Content-Type": 3,
            "Image Bytes": 6,
            "File Name Suffix": 2
        }

        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"

        # Get the central bytes
        image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str) // 4
        filename_suffix = filename[-10:]

        modified_metadata = {
            "Image Height": metadata.get("Image Height", "NONE"),
            "Image Width": metadata.get("Image Width", "NONE"),
            "File Size": metadata.get("File Size", "NONE"),
            "Content-Type": metadata.get("Content-Type", "NONE"),
            "Image Bytes": image_bytes_str[byte_offset:-byte_offset],
            "File Name Suffix": filename_suffix
        }

        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag,
             weight), (meta_tag,
                       meta_value) in zip(feature_weight_dict.items(),
                                          modified_metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features

    def generate_features(self, filename):
        """Given an image generate a feature vector"""
        """ 
            Since Tika-Py requires a server call (i.e. slower)
            Do native image metadata grabbing, and fallback on tika if the
            image can't be opened (i.e., it's an svg or gif)
        """
        im, use_tika = None, False
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True

        # Grab the metadata for the image
        metadata = {}

        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []

        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)

            # Grab the bytes of the entire file
            image_bytes = open(filename).read()

            # Get the central bytes

            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            #image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str) // 4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset]
            feature_tags = [
                "Image Height", "Image Width", "File Size", "Content-Type",
                "Image Bytes"
            ]
            features = [
                tag + ":" + metadata.get(tag, "NONE") for tag in feature_tags
            ]
            return features
        """ 
            FEATURES
                We'll resize the image so all images are normalized to a certain size 
                Also make sure to retain aspect ratio

                Features to use (in order of importance)
                    - center region bytes 
                    - color histogram
                    - content type
                    - image width
                    - image height

            We can take subregions of the image, and hash those
        """

        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30
        resize_height = resize_width * height / width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try:
            resize_im = im.resize((resize_width, resize_height),
                                  Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height / 5, resize_width / 5
            box = (width_padding, height_padding, resize_width - width_padding,
                   resize_height - height_padding)
            sub_region = resize_im.crop(box)

            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(
                list(sub_region.getdata())), 3
        except OSError:

            # Couldn't resize the image. Let's
            print >> sys.stderr, "Couldn't resize the image. Prob an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im

            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            #image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str) // 4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset]

        extension = resize_im.format if resize_im.format != None else os.path.splitext(
            filename)[1]

        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())

        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Image Histogram": histogram_weight,
            "Content-Type": 5,
            "Center Region Bytes": center_region_weight
        }

        metadata = {
            "Image Height": str(width),
            "Image Width": str(height),
            "Image Histogram": histogram_bytes,
            "Content-Type": content_type,
            "Center Region Bytes": center_region_bytes
        }

        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag,
             weight), (meta_tag,
                       meta_value) in zip(feature_weight_dict.items(),
                                          metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features

    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""

        smaller_nd = self if len(self.image_dictionary) <= len(
            nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(
            nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary

        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():

            # If an exact duplicate exists, just grab it and merge them
            if larger_nd.image_dictionary.get(key, None) != None:
                arr = smaller_nd.image_dictionary.get(key, []) +\
                        larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue

            # Find the closest near duplicate in the larger dictionary by
            # using it's index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]

            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(
                simhash_obj)

            # If a near duplicate exists
            if len(near_duplicates_keys) > 0:
                # grab the array of images at that key in the larger dictionary
                # Merge it the array of images in the smaller dictionary
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) +\
                        larger_nd.image_dictionary.get(near_dup_key, [])

                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue

            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key]

            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)

        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index

        nd.image_dictionary = final_dict
        nd.simhash_index = larger_nd.simhash_index

        # Now simply return this final dict
        return final_dict

    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert it's value to a hexadecimal key 
            This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))

    def deduplicate_images(self):
        """
            Given a list of image files "self.filenames", deduplicate the images using
            near deduplication 
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)

            # Simhash this list of features
            sHash = Simhash(feature_array)
            if self.simhash_index == None:
                # First image, so we create the index add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)

                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
                continue

            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)

            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary

                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0]

                # Get the key for this current image
                current_simhash_key = self.simhash_value_to_key(sHash)

                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename": image_file,
                    "hash_key": current_simhash_key,
                    "hash_object": sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(
                    current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)

                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)

                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
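A hedged usage sketch for the NearDuplicate class above; the file names are made up, and the class itself additionally needs PIL/Pillow, tika, and exifread, as its methods show.

# Hypothetical image paths; near-duplicate files end up grouped under one key.
filenames = ['img/cat_a.jpg', 'img/cat_a_copy.jpg', 'img/dog.png']
nd = NearDuplicate(filenames, k=2)
nd.deduplicate_images()
for key, images in nd.image_dictionary.items():
    print(key + ': ' + str([img['filename'] for img in images]))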
Example #34
        f_stop.close()
    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ''.join(mywordlist)


#data.head()['content'].apply(lambda x:jiebaclearText(str(x)))

data['content'] = data['content'].apply(lambda x: jiebaclearText(str(x)))
data['simhash'] = data['content'].apply(lambda x: Simhash(x).value)

train = data.loc[data['source'] == 'train']
test = data.loc[data['source'] == 'test']

train.drop('source', axis=1, inplace=True)
test.drop([
    'source',
], axis=1, inplace=True)

objs = [(row["id"], Simhash(row["content"]))
        for index, row in train.iterrows()]

index = SimhashIndex(objs, k=12)
test['result'] = test['content'].apply(
    lambda x: index.get_near_dups(Simhash(x)))

sub['result'] = test['result']
sub.to_csv('../output/simhash.csv', index=False)
Example #35
        data.update(dx)

print "OBJS SIZE", len(objs)
index = SimhashIndex(objs, f=f_dim, k=3)
print "Bucket size", index.bucket_size()

accounted_keys = set()
dataset = []
C = collections.Counter()

for key, val in tqdm(objs):
    # Skip if we've seen this pattern before
    if key in accounted_keys:
        continue

    dupes = index.get_near_dups(val)
    tweet = data[key]

    # Don't report self-matches
    if len(dupes) <= 1:
        continue

    # Don't report if person repeats only themself
    unique_names = set(['_'.join(x.split('_')[:-1]) for x in dupes])
    if len(unique_names) <= 1:
        continue

    accounted_keys.update(dupes)

    for k1, k2 in itertools.combinations(unique_names, r=2):
        #dataset.append({"name1":k1,"name2":k2,"tweet":tweet})
Example #36
    mimas_cursor = db_miams_eagle.cursor()
    sql = 'select id,content from api_tractate limit 100'
    mimas_cursor.execute(sql)
    data = list(mimas_cursor.fetchall())

    file = open("tractate.txt", "w", encoding="utf-8")
    for one in range(0, len(data)):
        begin = time.time()
        text1 = data[one].get("content", None)
        one_id = data[one].get("id", None)
        all_similar_data[one_id] = text1

    objs = [(str(k), Simhash(get_features(v)))
            for k, v in all_similar_data.items()]
    index = SimhashIndex(objs, k=6)
    print(index.bucket_size())

    for key, value in all_similar_data.items():
        s1 = Simhash(get_features(value))
        simi_list = index.get_near_dups(s1)
        simi_list.sort(key=lambda x: int(x), reverse=False)

        if len(simi_list) > 1 and simi_list not in all_similar_set:
            all_similar_set.append(simi_list)

    for item in all_similar_set:
        file.write(str(item))
        file.write("\n")
    print("100条数据计算的时间:%f" % (time.time() - begin))
Example #37
print '---------------------------------'
print ' Wrong Operating System'
print '---------------------------------'
print len(wrongos)



#########################################################################################################################################

if(args.near.upper() == 'Y'):
    fuzzy = []
    for line in unknown:
        try:
            fuzz = Simhash(get_features(unicode(line)))
            num = index.get_near_dups(fuzz)
            if len(num) != 0:
                fuzzy.append(line)
        except:
            pass

    print '---------------------------------'
    print ' Total Fuzzy Near Matches'
    print '---------------------------------'
    print len(fuzzy)

#########################################################################################################################################

wrongpath = []

for line in unknown:
Example #38
     fdoc = open(docfile, 'w')
     fcut = open(cutfile, 'w')
     num = 0
     simnum = 0
     cutnum = 0
     #simhash
     index = SimhashIndex({})
     for one in weibocollection.find(
         {'create_time': {
             '$gte': lasttimestamp,
             '$lt': curtimestamp
         }}):
         weibo_id = str(one['_id'])
         weibo_text = one['data']['text'].strip()
         text_sh = Simhash(weibo_text)
         if len(index.get_near_dups(text_sh)) == 0:  #not find sim
             #cut
             text_seg = jieba.cut(weibo_text)
             text_result = list(set(text_seg) - stopwords)
             content = ' 1 '.join(text_result)
             if content != '':
                 fdoc.write(weibo_id + '\t' + weibo_text.encode('utf-8') +
                            '\n')
                 fcut.write(content.encode('utf-8') + ' 1\n')
                 cutnum += 1
             simnum += 1
         num += 1
         index.add(num, text_sh)
 except pymongo.errors, e:
     logger.critical('mongo find error: %s' % e)
     sys.exit(-2)

Example #39
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    pass


if __name__ == '__main__':
    main()
Example #40
    'utf-8', 'ignore')

s2 = 'How are you i am fine. blar blar blar blar blar than'.decode(
    'utf-8', 'ignore')

s3 = 'This is simhash test.'.decode('utf-8', 'ignore')

# print get_features(s1)
#
# print Simhash(get_features('How are you? I am fine. Thanks.')).value

sh1 = Simhash(s1)
sh2 = Simhash(s2)
sh3 = Simhash(s3)

# print sh.value

# print sh1.distance(sh2)

shIndex = SimhashIndex([], k=3)
shIndex.add('1', sh1)
shIndex.add('2', sh2)
# shIndex.add('3', sh3)

if shIndex.get_near_dups(sh3):
    print 'YES'
else:
    print 'NO'

# print shIndex.get_near_dups(sh2)
Example #41
s3 = 'This is simhash test.'.decode('utf-8', 'ignore')

# print get_features(s1)
#
# print Simhash(get_features('How are you? I am fine. Thanks.')).value


sh1 = Simhash(s1)
sh2 = Simhash(s2)
sh3 = Simhash(s3)

# print sh.value


# print sh1.distance(sh2)

shIndex = SimhashIndex([], k=3)
shIndex.add('1', sh1)
shIndex.add('2', sh2)
# shIndex.add('3', sh3)

if shIndex.get_near_dups(sh3):
    print 'YES'
else:
    print 'NO'

# print shIndex.get_near_dups(sh2)


Example #42
def is_valid(config, robot_cache_a, robot_cache_d, robot_url_cache, mem, mem2,
             longest_page, common_dict, ics_subdomains, url):
    """
	mem = set() #memory cache of unique urls
	robot_cache_a = set() #memory cache of allowed urls
    robot_cache_d = set() #memory cache of disallowed urls
    robot_url_cache = set() #memory cache of crawled robots.txt stored as netloc
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in set(["http", "https"]):
            return False
        else:
            url = url.replace(parsed.fragment, "")

            extbool = not re.match(
                r".*\.(css|js|bmp|gif|jpe?g|ico" +
                r"|png|tiff?|mid|mp2|mp3|mp4" +
                r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
                r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names" +
                r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso" +
                r"|epub|dll|cnf|tgz|sha1|sql" + r"|thmx|mso|arff|rtf|jar|csv" +
                r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())

            extbool2 = not re.match(
                r".*\.(css|js|bmp|gif|jpe?g|ico" +
                r"|png|tiff?|mid|mp2|mp3|mp4" +
                r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
                r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names" +
                r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso" +
                r"|epub|dll|cnf|tgz|sha1|sql" + r"|thmx|mso|arff|rtf|jar|csv" +
                r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.query.lower())

            extbool3 = not re.match(
                r".*/(css|js|bmp|gif|jpe?g|ico" + r"|png|tiff?|mid|mp2|mp3|mp4"
                + r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
                r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names" +
                r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso" +
                r"|epub|dll|cnf|tgz|sha1|sql" + r"|thmx|mso|arff|rtf|jar|csv" +
                r"|rm|smil|wmv|swf|wma|zip|rar|gz)/.*", parsed.path.lower())

            ebool = extbool and extbool2 and extbool3

            sub_bool = re.match(r"(www.)?[-a-zA-Z0-9.]*\.ics\.uci\.edu",
                                parsed.netloc)
            sub_bool2 = re.match(r"(www.)?[-a-zA-Z0-9.]*\.cs\.uci\.edu",
                                 parsed.netloc)
            sub_bool3 = re.match(
                r"(www.)?[-a-zA-Z0-9.]*\.informatics\.uci\.edu", parsed.netloc)
            sub_bool4 = re.match(r"(www.)?[-a-zA-Z0-9.]*\.stat\.uci\.edu",
                                 parsed.netloc)
            sub_bool5 = (re.match(r"(www.)?[-a-zA-Z0-9.]*today\.uci\.edu",
                                  parsed.netloc)
                         and (parsed.path
                              == "/department/information_computer_sciences/"))

            sbool = sub_bool or sub_bool2 or sub_bool3 or sub_bool4 or sub_bool5

            if (ebool and sbool):
                try:
                    if parsed.netloc not in robot_url_cache:
                        robot_url_cache.add(parsed.netloc)
                        robot_site = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
                        robot_resp = download.download(robot_site,
                                                       config,
                                                       logger=None)
                        if robot_resp.status == 200:
                            robot_txt = robot_resp.raw_response.text
                            parse(parsed, robot_txt, robot_cache_a,
                                  robot_cache_d)

                    if url not in mem:
                        site_resp = requests.get(url)
                        if site_resp.status_code == 200:
                            #simhash here
                            doc = site_resp.text
                            soup = BeautifulSoup(doc, 'html.parser')
                            #filter text from site
                            [
                                s.extract() for s in soup([
                                    'style', 'script', '[document]', 'head',
                                    'title'
                                ])
                            ]
                            text_only = soup.getText()
                            filtered_text = text_only.split()

                            #LOW INFO CONTENT
                            if len(filtered_text) < 20:
                                return False

                            s = Simhash(filtered_text)
                            index = SimhashIndex(mem2)  #k=2
                            if index.get_near_dups(s) != []:
                                return False
                            else:
                                if url in robot_cache_a:
                                    check(filtered_text, common_dict,
                                          longest_page, ics_subdomains,
                                          sub_bool, parsed.netloc, url)
                                    mem.add(url)
                                    mem2.append((str(url), s))
                                    return True
                                elif url in robot_cache_d:
                                    return False
                                else:
                                    check(filtered_text, common_dict,
                                          longest_page, ics_subdomains,
                                          sub_bool, parsed.netloc, url)
                                    mem.add(url)
                                    mem2.append((str(url), s))
                                    return True
                        else:
                            return False
                    else:
                        return False
                except socket.gaierror:
                    return False
                except requests.exceptions.Timeout:
                    return False
                except requests.exceptions.TooManyRedirects:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.RequestException:
                    return False
            else:
                return False

    except TypeError:
        #print ("TypeError for ", parsed)
        return False
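The `is_valid` example above rebuilds `SimhashIndex(mem2)` from scratch for every URL it checks. Below is a minimal sketch of the same near-duplicate test kept in one persistent index; the `PageDeduper` name and its methods are hypothetical and not part of the example.

from simhash import Simhash, SimhashIndex


class PageDeduper:
    """Hypothetical helper: remember page fingerprints in a single index."""

    def __init__(self, k=2):
        # k is the Hamming-distance tolerance SimhashIndex uses for near-duplicates
        self.index = SimhashIndex([], k=k)

    def is_near_duplicate(self, url, tokens):
        """Return True if the tokenized page looks like one already seen,
        otherwise remember it and return False."""
        sh = Simhash(tokens)
        if self.index.get_near_dups(sh):
            return True
        self.index.add(str(url), sh)
        return False


# Usage sketch inside a crawler loop:
# deduper = PageDeduper(k=2)
# if deduper.is_near_duplicate(url, filtered_text):
#     return False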
Example #43
0
class NearDuplicate:
    def __init__(self, filenames, k=2, metadata_dictionary=None):
        self.filenames = filenames
        self.simhash_index = None 
        self.image_dictionary = {}
        self.metadata_dictionary = metadata_dictionary
        self.k = k 
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means) 
    
    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """ Use this function when we provide json metadata information from
            the tika java module"""

        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})
       
        # The tags or type of metadata we want
        feature_tags = ["Image Height", "Image Width", "File Size", "Content-Type", "Image Bytes", "File Name Suffix"]

        # Create a feature array using these metadata values
        features = []

        feature_weight_dict = {
                "Image Height" : 1, 
                "Image Width" : 1,
                "Files Size" : 2,
                "Content-Type" : 3,
                "Image Bytes" : 6, 
                "File Name Suffix" :2 
        }

        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"

        # Get the central bytes 
        image_bytes_str = unicode( str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str)//4
        filename_suffix = filename[-10:]

        modified_metadata = {
                "Image Height" : metadata.get("Image Height", "NONE"), 
                "Image Width" : metadata.get("Image Width", "NONE"),
                "File Size" : metadata.get("File Size", "NONE"),
                "Content-Type" : metadata.get("Content-Type", "NONE"),
                "Image Bytes" : image_bytes_str[byte_offset:-byte_offset], 
                "File Name Suffix" : filename_suffix
        }
       
        # Create an array of (token, weight) tuples. These are our features and
        # weights for the Simhash. Look each weight up by its tag so the pairing
        # does not depend on dict iteration order.
        for feature_tag, weight in feature_weight_dict.items():
            meta_value = modified_metadata.get(feature_tag, "NONE")
            features.append((feature_tag + ":" + meta_value, weight))

        return features


    def generate_features(self, filename):
        """Given an image generate a feature vector"""

        """ 
            Since Tika-Py requires a server call (i.e. slower)
            Do native image metadata grabbing, and fallback on tika if the
            image can't be opened (i.e., it's an svg or gif)
        """
        im, use_tika = None, False 
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True
            
        # Grab the metadata for the image
        metadata = {} 
        
        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []

        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)

            # Grab the bytes of the entire file
            image_bytes = open(filename, 'rb').read()

            # Get the central bytes 

            image_bytes_str = unicode( str(image_bytes), 'utf-8', "ignore")
            #image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str)//4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset] 
            feature_tags = ["Image Height", "Image Width", "File Size", "Content-Type", "Image Bytes"]
            features = [tag + ":" + metadata.get(tag,"NONE") for tag in feature_tags]
            return features

        """ 
            FEATURES
                We'll resize the image so all images are normalized to a certain size 
                Also make sure to retain aspect ratio

                Features to use (in order of importance)
                    - center region bytes 
                    - color histogram
                    - content type
                    - image width
                    - image height

            We can take subregions of the image, and hash those
        """

        
        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30 
        resize_height = resize_width*height/width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try :
            resize_im = im.resize((resize_width, resize_height), Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height/5, resize_width/5
            box = (width_padding, height_padding, resize_width - width_padding, 
                    resize_height - height_padding)
            sub_region = resize_im.crop(box)
            
            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(list(sub_region.getdata())), 3
        except OSError:
            
            # Couldn't resize the image; fall back to the raw file bytes
            print >> sys.stderr, "Couldn't resize the image. Probably an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im

            # Grab the bytes of the entire file
            image_bytes = open(filename, 'rb').read()
            # Get the central bytes 
            #image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode( str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str)//4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset] 
         
        extension = resize_im.format if resize_im.format is not None else os.path.splitext(filename)[1].lstrip('.')
         
        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())
        
        
        feature_weight_dict = {
                "Image Height" : 1, 
                "Image Width" : 1,
                "Image Histogram" : histogram_weight,
                "Content-Type" : 5,
                "Center Region Bytes" : center_region_weight 
        }

        metadata = {
                "Image Height" : str(width), 
                "Image Width" : str(height),
                "Image Histogram" : histogram_bytes,
                "Content-Type" : content_type,
                "Center Region Bytes" : center_region_bytes 
        }
       
        # Create an array of (token, weight) tuples. These are our features and
        # weights for the Simhash. Look each weight up by its tag so the pairing
        # does not depend on dict iteration order.
        for feature_tag, weight in feature_weight_dict.items():
            features.append((feature_tag + ":" + metadata.get(feature_tag, "NONE"), weight))

        return features 


    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""

        smaller_nd = self if len(self.image_dictionary) <= len(nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary

        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():
            

            # If an exact duplicate exists, just grab it and merge them 
            if larger_nd.image_dictionary.get(key) is not None:
                arr = smaller_nd.image_dictionary.get(key, []) +\
                        larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue

            # Find the closest near duplicate in the larger dictionary by
            # using its simhash index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]

            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(simhash_obj)
            
            # If a near duplicate exists 
            if len(near_duplicates_keys) > 0:
                # grab the array of images at that key in the larger dictionary
                # Merge it the array of images in the smaller dictionary 
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) +\
                        larger_nd.image_dictionary.get(near_dup_key, [])

                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue
                
            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key] 

            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)

        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index

        nd.image_dictionary = final_dict
        nd.simhash_index = larger_nd.simhash_index

        # Now simply return this final dict 
        return final_dict


    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert it's value to a hexadecimal key 
            This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))


    def deduplicate_images(self):
        """
            Given a list of image files "self.filenames", deduplicate the images using
            near deduplication 
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary is not None:
                # Will use a java tika program to generate metadata 
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)
        
            # Simhash this list of features
            sHash = Simhash(feature_array)
            if self.simhash_index is None:
                # First image, so we create the index, add it to the dictionary,
                # and move on to the next iteration
                key = self.simhash_value_to_key(sHash)

                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename" : image_file, 
                    "hash_key" : key, 
                    "hash_object": sHash
                }] 
                continue

            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)

            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary

                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0] 

                # Get the key for this current image 
                current_simhash_key = self.simhash_value_to_key(sHash) 

                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename" : image_file, 
                    "hash_key" : current_simhash_key,
                    "hash_object" : sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)

                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)

                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename" : image_file, 
                    "hash_key" : key,
                    "hash_object" : sHash
                }]
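The `NearDuplicate` class above feeds `(token, weight)` pairs into `Simhash`. Here is a minimal standalone sketch of that weighted-feature idea, assuming the installed simhash version accepts such pairs; all metadata values below are made-up illustrations.

from simhash import Simhash, SimhashIndex

# Made-up weighted features for two nearly identical images
features_a = [
    ("Image Height:480", 1),
    ("Image Width:640", 1),
    ("Content-Type:image/jpeg", 5),
    ("Center Region Bytes:abc123", 3),  # heavier tokens dominate the fingerprint
]
features_b = [
    ("Image Height:481", 1),  # small metadata difference
    ("Image Width:640", 1),
    ("Content-Type:image/jpeg", 5),
    ("Center Region Bytes:abc123", 3),
]

sh_a = Simhash(features_a)
sh_b = Simhash(features_b)

index = SimhashIndex([(str(hex(sh_a.value)), sh_a)], k=2)
print(index.get_near_dups(sh_b))  # expected to contain sh_a's key when the hashes are close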
Example #44
0
import re
from simhash import Simhash, SimhashIndex
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

data = {
    1: u'2018-02-17 0:00:00,2018-02-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00',
    2: u'2018-02-16 0:00:00,60125170993,2018-02-16 0:00:00,86000300257742,2018-01-26 0:00:00',
    3: u'2018-02-15 0:00:00,60125170993,2018-02-15 0:00:00,86011600116290,2018-01-26 0:00:00',
    4: u'2018-02-14 0:00:00,60125170993,2018-02-14 0:00:00,86008501214219,2018-01-26 0:00:00',
    5: u'2018-02-13 0:00:00,60125170993,2018-02-13 0:00:00,86000300420496,2018-01-26 0:00:00',
    6: u'2018-02-12 0:00:00,60125170993,2018-02-12 0:00:00,86000300656419,2018-01-26 0:00:00',
    7: u'2018-02-11 0:00:00,60125170993,2018-02-11 0:00:00,86553802671042,2018-01-26 0:00:00'
}
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

print index.bucket_size()

s1 = Simhash(get_features(u'2018-02-17 0:00:00,2018-03-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00'))
print index.get_near_dups(s1)

index.add('0', s1)
print index.get_near_dups(s1)
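
For reference, a short sketch (Python 3 print style) tying the `k` used above to `Simhash.distance()`: `get_near_dups()` is expected to report an entry only when the Hamming distance between fingerprints is at most `k`. The two strings are rows 1 and 2 of the `data` dict in the example above.

import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    # same 3-character shingling as the example above
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


row1 = Simhash(get_features(u'2018-02-17 0:00:00,2018-02-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00'))
row2 = Simhash(get_features(u'2018-02-16 0:00:00,60125170993,2018-02-16 0:00:00,86000300257742,2018-01-26 0:00:00'))

# Hamming distance between the two 64-bit fingerprints
print(row1.distance(row2))

index = SimhashIndex([('1', row1)], k=3)
# ['1'] only if the distance printed above is <= 3, otherwise []
print(index.get_near_dups(row2))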