Exemplo n.º 1
0
    # Public: builds the detector by loading every row of the dj_document
    #         table into a fresh near-duplicates index
    #
    # test_docs_dir - base directory stored on the instance (default "./test");
    #                 this constructor does not read any file from it
    #
    # For each fetched row, column 1 is appended to the index under the name
    # found in column 0, and column 0 is also recorded in self.files.
    #
    # Raises psycopg2.Error (after printing a short message) when the
    # connection or the SELECT fails.
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir
        self.files = []

        self.index = NearDuplicatesIndex()

        # NOTE(review): the connection string is hard-coded; confirm whether it
        # should come from configuration instead.
        try:
            conn = psycopg2.connect("dbname='djangology' user='******' password=''")
        except psycopg2.Error:
            # A single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("I am unable to connect to the database.")
            # Re-raise: without a connection nothing below can work, and
            # continuing would only fail later with an unrelated NameError.
            raise

        try:
            cur = conn.cursor()
            try:
                cur.execute("""SELECT * from dj_document""")
                rows = cur.fetchall()
            except psycopg2.Error:
                print("I can't SELECT from dj_document")
                raise
            finally:
                cur.close()
        finally:
            # All rows are already in memory, so the connection can be
            # released before the index is built.
            conn.close()

        # NOTE(review): relies on the column order returned by SELECT *;
        # presumably column 0 is the document id and column 1 its content.
        for row in rows:
            self.index.append(row[1], row[0])
            self.files.append(row[0])
Exemplo n.º 2
0
    # Public: builds the detector from the documents found in a directory
    #
    # test_docs_dir - directory to scan (default "./test")
    #
    # Keeps the names of the regular files in test_docs_dir whose name does
    # not start with "." in self.files, then reads each one and appends its
    # space-separated tokens to a new NearDuplicatesIndex, keyed by the path
    # returned by self.filename.
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir

        visible = []
        for entry in os.listdir(test_docs_dir):
            candidate = os.path.join(test_docs_dir, entry)
            if os.path.isfile(candidate) and entry[0] != ".":
                visible.append(entry)
        self.files = visible

        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        edge_punctuation = ",.!|&-_()[]<>{}/\"'"
        for entry in self.files:
            path = self.filename(entry)
            with open(path) as handle:
                text = handle.read()
                # Trim whitespace, then punctuation at both ends of the whole
                # text, then whitespace again, before splitting on spaces.
                trimmed = text.strip().strip(edge_punctuation).strip()
                self.index.append(trimmed.split(" "), path)
Exemplo n.º 3
0
class Detector:
    # Public: builds the detector from the documents found in a directory
    #
    # test_docs_dir - directory to scan (default "./test")
    #
    # self.files holds the names of the regular files in test_docs_dir whose
    # name does not start with "."; each of them is read, split on spaces and
    # appended to self.index under its full path (see filename).
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir
        self.files = [
            d for d in os.listdir(test_docs_dir)
            if os.path.isfile(os.path.join(test_docs_dir, d)) and d[0] != "."
        ]

        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        for name in self.files:
            path = self.filename(name)
            with open(path) as f:
                # Only whitespace and punctuation at the two ends of the whole
                # text are trimmed; tokens are then cut on single spaces.
                doc = f.read().strip().strip(
                    ",.!|&-_()[]<>{}/\"'").strip().split(" ")
                self.index.append(doc, path)

    # Public: returns the full relative path from the base dir of the project
    #         to the filename input
    #
    # filename - the filename relative to the test directory
    #
    # Returns full filename (including test directory)
    def filename(self, filename):
        return "%s/%s" % (self.test_docs_dir, filename)

    # Public: checks for near-duplicates in the set of files based on a
    #         jaccard coefficient threshold
    #
    # threshold - pairs are reported only when their coefficient is strictly
    #             greater than this value (default 0.5)
    #
    # Every unordered pair of files is compared exactly once, in the order of
    # self.files.
    #
    # Returns a string containing formatted names and coefficients of
    #   documents whose jaccard coefficient is greater than the threshold,
    #   one pair per line (empty string when there is no match)
    def check_for_duplicates(self, threshold=0.5):
        matches = []
        for position, f1 in enumerate(self.files):
            file1 = self.filename(f1)
            # Start after the current file so a pair is never compared twice
            # and a file is never compared with itself.
            for f2 in self.files[position + 1:]:
                file2 = self.filename(f2)
                jaccard = self.index.get_jaccard(file1, file2)
                if jaccard > threshold:
                    matches.append(
                        "%s and %s are near-duplicates, with Jaccard value of %0.3f."
                        % (f1, f2, jaccard))
        return "\n".join(matches)
Exemplo n.º 4
0
    # Public: builds the detector from the documents found in a directory
    #
    # test_docs_dir - directory to scan (default "./test")
    #
    # Records in self.files every regular file of test_docs_dir whose name
    # does not begin with ".", then feeds the space-separated tokens of each
    # file to a new NearDuplicatesIndex under the path given by self.filename.
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir

        def is_visible_file(entry):
            return os.path.isfile(os.path.join(test_docs_dir, entry)) and entry[0] != "."

        self.files = [entry for entry in os.listdir(test_docs_dir) if is_visible_file(entry)]

        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        punctuation = ",.!|&-_()[]<>{}/\"'"
        for entry in self.files:
            path = self.filename(entry)
            with open(path) as source:
                # Whitespace and punctuation are removed only from the two
                # ends of the whole text before it is split on spaces.
                words = source.read().strip().strip(punctuation).strip().split(" ")
                self.index.append(words, path)
Exemplo n.º 5
0
class Detector:
    # Public: builds the detector from the documents found in a directory
    #
    # test_docs_dir - directory to scan (default "./test")
    #
    # self.files receives the names of the regular, non-dot files found in
    # test_docs_dir; each one is read, split on spaces and appended to
    # self.index under its full path (see filename).
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir

        visible_files = []
        for entry in os.listdir(test_docs_dir):
            full_path = os.path.join(test_docs_dir, entry)
            if os.path.isfile(full_path) and entry[0] != ".":
                visible_files.append(entry)
        self.files = visible_files

        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        for entry in self.files:
            doc_path = self.filename(entry)
            with open(doc_path) as handle:
                text = handle.read()
                # Trim whitespace, then edge punctuation, then whitespace again.
                trimmed = text.strip().strip(",.!|&-_()[]<>{}/\"'").strip()
                self.index.append(trimmed.split(" "), doc_path)

    # Public: prefixes a file name with the test directory
    #
    # filename - the filename relative to the test directory
    #
    # Returns the test directory and the filename joined by "/"
    def filename(self, filename):
        parts = (self.test_docs_dir, filename)
        return "%s/%s" % parts

    # Public: reports the pairs of files whose jaccard coefficient is
    #         strictly greater than 0.5
    #
    # Each unordered pair of entries of self.files is compared once, in list
    # order, using the full paths as index keys.
    #
    # Returns a string with one line per matching pair (names and
    #   coefficient), or an empty string when nothing matches
    def check_for_duplicates(self):
        template = "%s and %s are near-duplicates, with Jaccard value of %0.3f."
        report = []
        for next_position, first in enumerate(self.files, 1):
            first_path = self.filename(first)
            # Only the files after the current one are candidates.
            for second in self.files[next_position:]:
                second_path = self.filename(second)
                score = self.index.get_jaccard(first_path, second_path)
                if score > 0.5:
                    report.append(template % (first, second, score))
        return "\n".join(report)
Exemplo n.º 6
0
 # Prepares two four-word documents that differ only in their third word,
 # plus an empty NearDuplicatesIndex stored in self.index.
 def setUp(self):
     self.docs = [
         ['this', 'is', 'a', 'document'],
         ['this', 'is', 'b', 'document'],
     ]
     self.index = NearDuplicatesIndex()
Exemplo n.º 7
0
class TestNearDuplicatesIndex(unittest.TestCase):
    # Prepares two four-word documents that differ only in their third word,
    # plus an empty NearDuplicatesIndex stored in self.index.
    def setUp(self):
        self.docs = [
            ['this', 'is', 'a', 'document'],
            ['this', 'is', 'b', 'document'],
        ]
        self.index = NearDuplicatesIndex()

    # Appending two documents under distinct names yields an index of length 2.
    def test_should_allow_to_append_documents(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        self.index.append(second, 'doc2')
        size = len(self.index)
        self.assertEqual(size, 2)

    # Re-using an existing document name must raise.
    def test_should_raise_an_error_when_docname_is_duplicated(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        with self.assertRaises(Exception):
            self.index.append(second, 'doc1')

    # The same content stored under two names has a coefficient of 1.0.
    def test_should_calculate_jaccard_coefficient(self):
        same_doc = self.docs[0]
        self.index.append(same_doc, 'doc1')
        self.index.append(same_doc, 'doc2')
        coefficient = self.index.get_jaccard('doc1', 'doc2')
        self.assertEqual(coefficient, 1.0)

    # Asking for the coefficient of names that were never appended must raise.
    def test_should_raise_an_error_if_document_does_not_exist(self):
        with self.assertRaises(Exception):
            self.index.get_jaccard('doc1', 'doc3')

    # appendif with 1.0 as third argument adds the second document.
    def test_should_append_a_document_if_its_not_duplicated(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        self.index.appendif(second, 'doc2', 1.0)
        size = len(self.index)
        self.assertEqual(size, 2)

    # appendif with -1.0 as third argument leaves the index unchanged.
    def test_should_not_append_a_document_if_its_duplicated(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        self.index.appendif(second, 'doc2', -1.0)
        size = len(self.index)
        self.assertEqual(size, 1)
Exemplo n.º 8
0
class Detector:
    # Public: builds the detector by loading every row of the dj_document
    #         table into a fresh near-duplicates index
    #
    # test_docs_dir - base directory stored on the instance (default "./test");
    #                 this constructor does not read any file from it
    #
    # For each fetched row, column 1 is appended to the index under the name
    # found in column 0, and column 0 is also recorded in self.files.
    #
    # Raises psycopg2.Error (after printing a short message) when the
    # connection or the SELECT fails.
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir
        self.files = []

        self.index = NearDuplicatesIndex()

        # NOTE(review): the connection string is hard-coded; confirm whether it
        # should come from configuration instead.
        try:
            conn = psycopg2.connect("dbname='djangology' user='******' password=''")
        except psycopg2.Error:
            # A single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("I am unable to connect to the database.")
            # Re-raise: without a connection nothing below can work, and
            # continuing would only fail later with an unrelated NameError.
            raise

        try:
            cur = conn.cursor()
            try:
                cur.execute("""SELECT * from dj_document""")
                rows = cur.fetchall()
            except psycopg2.Error:
                print("I can't SELECT from dj_document")
                raise
            finally:
                cur.close()
        finally:
            # All rows are already in memory, so the connection can be
            # released before the index is built.
            conn.close()

        # NOTE(review): relies on the column order returned by SELECT *;
        # presumably column 0 is the document id and column 1 its content.
        for row in rows:
            self.index.append(row[1], row[0])
            self.files.append(row[0])

    # Public: returns the full relative path from the base dir of the project
    #         to the filename input
    #
    # filename - the filename relative to the test directory
    #
    # Returns full filename (including test directory)
    def filename(self, filename):
        return "%s/%s" % (self.test_docs_dir, filename)

    # Public: checks for near-duplicates in the set of loaded documents based
    #         on jaccard coefficient threshold of 0.5
    #
    # The entries of self.files are used directly as index keys (they are not
    # passed through filename). Every unordered pair is compared exactly once.
    #
    # Returns a string containing formatted names and coefficients of
    #   documents whose jaccard coefficient is greater than 0.5, one pair per
    #   line (empty string when there is no match)
    def check_for_duplicates(self):
        matches = []
        for position, f1 in enumerate(self.files):
            # Start after the current entry so a pair is never compared twice.
            for f2 in self.files[position + 1:]:
                jaccard = self.index.get_jaccard(f1, f2)
                if jaccard > 0.5:
                    matches.append("%s and %s are near-duplicates, with Jaccard value of %0.3f." % (f1, f2, jaccard))
        return "\n".join(matches)