예제 #1
0
    def __init__(self, test_docs_dir="./test"):
        """Build the near-duplicates index from the ``dj_document`` table.

        Every row returned by ``SELECT * from dj_document`` is added to
        ``self.index`` via ``append(row[1], row[0])`` and ``row[0]`` is
        recorded in ``self.files``.
        NOTE(review): presumably row[0] is the document id and row[1] its
        content -- confirm against the table schema.

        If connecting or running the query fails, a message is printed and
        the instance is left with an empty index and an empty file list
        instead of failing later on an unbound connection or a cursor
        without results.

        Args:
            test_docs_dir: Stored on the instance; not otherwise used here.
        """
        self.test_docs_dir = test_docs_dir
        self.files = []
        self.index = NearDuplicatesIndex()

        # Try to connect. Only database errors are handled; anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        try:
            conn = psycopg2.connect(
                "dbname='djangology' user='******' password=''")
        except psycopg2.Error:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("I am unable to connect to the database.")
            return

        try:
            cur = conn.cursor()
            try:
                try:
                    cur.execute("""SELECT * from dj_document""")
                except psycopg2.Error:
                    print("I can't SELECT from dj_document")
                    return

                # Calculate near-duplicates index
                for row in cur.fetchall():
                    self.index.append(row[1], row[0])
                    self.files.append(row[0])
            finally:
                cur.close()
        finally:
            # The connection is local to this method, so release it here.
            conn.close()
예제 #2
0
    def __init__(self, test_docs_dir="./test"):
        """Index every visible regular file found in ``test_docs_dir``.

        ``self.files`` receives the names of the directory entries that are
        regular files and do not start with a dot.  Each file is then read,
        trimmed of surrounding whitespace and punctuation, split on single
        spaces, and appended to ``self.index`` together with the path
        returned by ``self.filename``.

        Args:
            test_docs_dir: Directory whose files are indexed.
        """
        self.test_docs_dir = test_docs_dir

        visible = []
        for entry in os.listdir(test_docs_dir):
            if not os.path.isfile(os.path.join(test_docs_dir, entry)):
                continue
            if entry[0] == ".":
                # Hidden file: leave it out of the index.
                continue
            visible.append(entry)
        self.files = visible

        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        for name in self.files:
            path = self.filename(name)
            with open(path) as handle:
                text = handle.read()
            # Only the two ends of the whole text are trimmed; punctuation
            # inside the document is kept as part of its tokens.
            text = text.strip()
            text = text.strip(",.!|&-_()[]<>{}/\"'")
            tokens = text.strip().split(" ")
            self.index.append(tokens, path)
예제 #3
0
 def setUp(self):
     """Prepare two token lists that differ in one token, plus a new index."""
     self.docs = [
         ['this', 'is', 'a', 'document'],
         ['this', 'is', 'b', 'document'],
     ]
     self.index = NearDuplicatesIndex()