Example No. 1
def hashdistance(str1, str2):
    # Build a 64-bit simhash fingerprint for each input string.
    hash1 = simhashpy(str1, 64)
    hash2 = simhashpy(str2, 64)

    #distance = 1 - hash1.similarity(hash2)
    #return hash1.similarity(hash2)
    print hash1, hash2
    # Number of bits that differ between the two fingerprints.
    print hash1.hamming_distance(hash2)
    # `corpus` is a module-level simhash corpus shared by these examples.
    print corpus.distance(hash1, hash2)
    corpus.insert(hash1)
    corpus.insert(hash2)
    # Every fingerprint already in the corpus that is near hash1.
    print corpus.find_all(hash1)
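
A minimal usage sketch, assuming `simhashpy` and a module-level `corpus` (with `insert`, `distance`, and `find_all`, as used above) are already in scope; the test strings come from the scratch experiment in Example No. 3 below:

# Driver for hashdistance(); prints both fingerprints, their bit
# distance, and any near-duplicates already stored in the corpus.
str1 = 'test love you'
str2 = 'love you test'
hashdistance(str1, str2)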
Example No. 2
def proxy_task(id):
    # Status codes used below: 0 = parsed OK, 3 = fetch error, 4 = parse error.
    try:
        html = HtmlContent.objects.get(pk=id)
        # Already fetched and parsed; nothing left to do.
        if html.status <= 1 and html.content != '':
            return html.status
    except HtmlContent.DoesNotExist:
        # Not found.
        return HttpResponse('not found')

    try:
        req = urllib2.Request(html.url)
        req.add_header('User-Agent', USER_AGENT)
        # ContentEncodingProcessor (defined elsewhere) handles gzip/deflate
        # compressed responses.
        encoding_support = ContentEncodingProcessor
        opener = urllib2.build_opener(encoding_support, urllib2.HTTPHandler)
        proxied_request = opener.open(req, timeout=12)
        content = proxied_request.read()
    except urllib2.HTTPError:
        html.status = 3
        print 'urllib2 error'
    else:
        # Decode as UTF-8 first, falling back to GBK for Chinese pages.
        try:
            ucontent = content.decode('utf-8')
        except UnicodeDecodeError:
            ucontent = content.decode('gbk', 'ignore')
        # Extract the title and article body from the raw page.
        tx = TextExtract(ucontent)
        html.title = tx.title
        html.content = tx.content.strip()
        if tx.content == '':
            print 'Parse html error'
            html.status = 4
        else:
            html.status = 0
            # 64-bit simhash fingerprint over the tokenized article body.
            html.hash = long(simhashpy(list(Tokenize(html.content))))
            html.tags, html.summerize = summarize(html.content)
            # Use the extracted preview only if it is long enough once
            # HTML tags are stripped; otherwise rebuild it from the body.
            if len(html_remove.sub('', tx.preview)) < 250:
                html.preview = TextToHtml(tx.content)
            else:
                html.preview = tx.preview

    html.save()
    if html.status == 0:
        print 'begin sim_index'
        #sim_index(html)

    return html.status
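
The fingerprinting step buried in `proxy_task` can be exercised on its own. A small sketch, assuming `Tokenize` and `simhashpy` behave as they do above:

# Sketch of the tokenize -> simhash -> integer step that fills html.hash,
# applied to a hypothetical piece of extracted text.
text = u'some extracted article body'
fingerprint = long(simhashpy(list(Tokenize(text))))
print fingerprint   # the value proxy_task would store in html.hash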
Example No. 3
def hash_test():
    # Delete stale documents (status=1 with non-empty content) from the
    # gensim similarity server exposed over Pyro4.
    sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    dels = []
    for obj in HtmlContent.objects.filter(status=1).filter(~Q(content='')):
        dels.append('html_%d' % obj.id)
    sim_server.delete(dels)


#hash_test()

# Compare three simhash implementations on the same pair of documents.
obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1, h2
print corpus.distance(h1, h2)

# Scratch experiment, commented out in the original (it sits inside a
# triple-quoted block whose closing quotes fall outside this snippet):
#str1 = 'test love you'
#str2 = 'love you test'
#t1 = str1.decode('utf-8').split()
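
The scratch block trails off after tokenizing `str1`. A plausible completion, mirroring the comparison pattern above (the variable names come from the snippet; the rest is an assumption):

# Hypothetical completion of the scratch experiment: two strings containing
# the same tokens in a different order.
str1 = 'test love you'
str2 = 'love you test'
t1 = str1.decode('utf-8').split()
t2 = str2.decode('utf-8').split()
h1 = simhashpy(t1, 64)
h2 = simhashpy(t2, 64)
print h1, h2
# Expected to be 0 if simhashpy ignores token order, as a pure
# bag-of-tokens simhash does.
print corpus.distance(h1, h2)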