Пример #1
0
	def getTextualHash(self, driver):
		"""
		Compute the Nilsimsa hex digest of the page held by ``driver``.

		driver -- object whose ``page_source`` attribute holds the markup
		          to hash

		Returns the value produced by ``Nilsimsa.hexdigest()`` for that
		markup.
		"""
		# read the markup first, then feed it to a fresh hasher
		page_markup = driver.page_source
		hasher = Nilsimsa()
		hasher.update(page_markup)
		return hasher.hexdigest()
Пример #2
0
def test_nilsimsa():
    """
    Check the Nilsimsa digest of one randomly chosen test file.

    A file name is drawn at random from ``test_data_dir``. The digest of
    that file's contents must equal the reference value stored in the
    pickled ``sid_to_nil`` dictionary under the part of the file name that
    precedes the first ".".
    """
    # os.listdir replaces dircache.listdir: the dircache module does not
    # exist in Python 3. Sorting reproduces dircache's sorted listing, so a
    # seeded ``random`` keeps picking the same file as before.
    fname = random.choice(sorted(os.listdir(test_data_dir)))
    # The context manager closes the file even if reading or hashing raises.
    with open(os.path.join(test_data_dir, fname), "rb") as f:
        nil = Nilsimsa(f.read())
    assert nil.hexdigest() == sid_to_nil[fname.split(".")[0]]
Пример #3
0
def test_nilsimsa():
    """
    Verify the Nilsimsa digest of a randomly selected test file.

    Picks one entry of ``test_data_dir`` at random, hashes the file's bytes
    and compares the hex digest with the expected value kept in the pickled
    ``sid_to_nil`` dictionary; the lookup key is the file name up to its
    first ".".
    """
    # dircache is gone in Python 3, so list the directory with os.listdir.
    # dircache.listdir returned a sorted list; sorted() keeps that ordering
    # so the choice stays reproducible when ``random`` is seeded.
    candidates = sorted(os.listdir(test_data_dir))
    fname = random.choice(candidates)
    # ``with`` guarantees the handle is released even when read() fails.
    with open(os.path.join(test_data_dir, fname), "rb") as f:
        nil = Nilsimsa(f.read())
    assert nil.hexdigest() == sid_to_nil[fname.split(".")[0]]
Пример #4
0
	def getNilsimsaHash(self, url, call_phantom=True):
		"""
		Return the Nilsimsa hex digest of the file "<self.id>-output.txt".

		url -- passed to self.setUpGetter() when call_phantom is true
		call_phantom -- when false, setUpGetter() is skipped and the file is
		expected to exist already

		Returns -1 when the file does not exist, otherwise the value of
		self.nilsimsa_hash. The file is deleted once it has been processed.

		NOTE(review): when reading or hashing fails, the error is only
		printed and self.nilsimsa_hash is returned with whatever value it
		held before this call -- confirm that callers expect this.
		"""
		if call_phantom:
			self.setUpGetter(url)
		dump_path = "{}-output.txt".format(self.id)
		# a missing output file means the page failed to load
		if not os.path.isfile(dump_path):
			return -1
		try:
			with open(dump_path, "rb") as dump:
				page_hash = Nilsimsa(dump.read())
			self.nilsimsa_hash = page_hash.hexdigest()
		except Exception as err:
			print(err)
		finally:
			# the output file is removed whether or not hashing succeeded
			os.remove(dump_path)
		return self.nilsimsa_hash
Пример #5
0
 def calc_nilsimsa(self, gold_surface_form, comp_surface_form):
     """Compare the Nilsimsa digests of two surface forms.

     Each argument is hashed with ``Nilsimsa`` and the two hex digests are
     handed to ``compare_digests``; its result is returned unchanged.
     """
     gold_hasher = Nilsimsa(gold_surface_form)
     comp_hasher = Nilsimsa(comp_surface_form)
     gold_digest = gold_hasher.hexdigest()
     comp_digest = comp_hasher.hexdigest()
     return compare_digests(gold_digest, comp_digest)
Пример #6
0
def get_ads(base_url):
    """Scrape the "/search/cpg" listing under ``base_url`` and store new ads.

    For every ``.row`` element of the listing page whose ``data-pid`` is not
    yet present in the ``ad`` table, the ad page is downloaded, the text of
    its ``postingbody`` element is reduced to printable characters, hashed
    with Nilsimsa, written to the cache, and recorded in the ``ad`` table.
    Per-word counts (excluding stopwords) are recorded in the ``tf`` table.
    Each new ad is committed individually.

    Relies on the module-level names ``conn``, ``download``, ``cache``,
    ``stopwords`` and ``generate_word_counts``.

    Args:
        base_url: site root; prefixed to the listing path and to ad links
            that do not start with "http".

    Raises:
        ValueError: if an ad page has neither a ``postingbody`` element nor
            a ``has_been_removed`` element.
    """
    c = conn.cursor()
    printable = set(string.printable)

    page = download.get(base_url + "/search/cpg")

    for p in page.select(".row"):
        pid = p['data-pid']

        a_tag = p.find('a', class_='hdrlnk')
        ad_href = a_tag['href']
        ad_title = a_tag.text

        posted = datetime.datetime.strptime(
            p.find('time')['datetime'], "%Y-%m-%d %H:%M")
        # time.mktime interprets the naive timestamp as local time, like the
        # former strftime("%s"), but "%s" is a non-portable platform
        # extension whereas mktime is available everywhere.
        posted = int(time.mktime(posted.timetuple()))

        c.execute("SELECT * FROM ad WHERE id = ?", (pid, ))
        if c.fetchone() is not None:
            # this ad is already stored
            continue

        url = ad_href if ad_href.startswith('http') else base_url + ad_href

        # pause briefly before every ad download
        time.sleep(0.5)
        ad = download.get(url)

        print(url)
        body = ad.find(id='postingbody')
        if body is None:
            if ad.find(id='has_been_removed'):
                continue
            # A bare string cannot be raised; use a real exception type.
            raise ValueError("malformed body")

        # Keep printable characters only. join() yields a string on both
        # Python 2 and Python 3, whereas filter() returns an iterator on 3.
        ad_text = "".join(
            ch for ch in body.text.strip() if ch in printable)
        lshash = Nilsimsa(ad_text).hexdigest()

        seen = generate_word_counts(ad_text)

        cache.write("text:" + pid, ad_text)

        c.execute(
            "INSERT INTO ad (id, url, title, posted, lshash) "
            "VALUES (?,?,?,?,?)",
            (pid, url, ad_title, posted, lshash))

        for word in seen:
            if word not in stopwords:
                c.execute(
                    "INSERT INTO tf (id, word, cnt) VALUES (?,?,?)",
                    (pid, word, seen[word]))
        conn.commit()
Пример #7
0
 def compute_hash(self, text):
     """Return the Nilsimsa hex digest of ``text``, converted with ``str``."""
     # the hashing library is imported at call time
     from nilsimsa import Nilsimsa
     hexdigest = Nilsimsa(data=text).hexdigest()
     return str(hexdigest)
Пример #8
0
def test_unicode():
    """
    ensures that feeding unicode to Nilsimsa behaves gracefully
    """
    # A "\u" escape consumes exactly four hex digits, so the first literal
    # below is the two-character string U+1F63 followed by "1", not the
    # code point U+1F631. The eight-digit "\U" form is needed for code
    # points above U+FFFF. Both inputs are exercised so the previously
    # tested string stays covered.
    samples = (
        u'\u1F631',
        u'\U0001F631',
    )
    for sample in samples:
        nil = Nilsimsa(sample)
        assert nil.hexdigest()
Пример #9
0
import getCodeFragment
import os
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Directory with the reference fixtures, located relative to this script.
# The components are joined one by one instead of embedding "\\" so the path
# is also valid on non-Windows systems; the trailing "" keeps the trailing
# separator that the value has always carried.
test_data_dir = os.path.join(os.path.dirname(__file__),
                             "nilsimsa", "test_data", "")
test_data = "test_dict.p"
test_dict = os.path.join(test_data_dir, test_data)
# NOTE: unpickling executes arbitrary code; only load fixtures you trust.
# The context manager closes the file once the dictionary has been loaded.
with open(test_dict, "rb") as fixture:
    sid_to_nil = pickle.load(fixture)

# Hash two 64-character inputs that differ only in their last character,
# then print both digests and the result of comparing them.
nil = Nilsimsa('0' * 64)
s1 = nil.hexdigest()
nil = Nilsimsa('0' * 63 + '1')
s2 = nil.hexdigest()
# Single-argument print(...) produces the same output on Python 2 and 3.
print("%s %s" % (s1, s2))
print(compare_digests(s1, s2))

# for i in range(1,30):
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i)
#     s1 = Nilsimsa(cloneGroup[0]).hexdigest()
#     s2 = Nilsimsa(cloneGroup[1]).hexdigest()
#     #print s1,s2
#     print compare_digests(s1,s2)
#     if compare_digests(s1,s2) <0:
#         getCodeFragment.printCloneClass('1.2.txt', i)

# for i in range(1,50):