def __init__(self, test_docs_dir="./test"):
    """Build the near-duplicates index from the rows of ``dj_document``.

    Every row returned by ``SELECT * from dj_document`` is added to the
    index as ``self.index.append(row[1], row[0])`` and ``row[0]`` is
    recorded in ``self.files``.

    NOTE(review): presumably column 0 is the document identifier and
    column 1 its content -- confirm against the table schema, since
    ``SELECT *`` depends on column order.

    :param test_docs_dir: stored on the instance as ``test_docs_dir``;
        it is not otherwise used while loading from the database.
    :raises psycopg2.Error: if connecting to the database or running the
        query fails. The diagnostic message is still printed first.
    """
    self.test_docs_dir = test_docs_dir
    self.files = []
    self.index = NearDuplicatesIndex()

    # Connect. On failure, report and re-raise the real database error
    # instead of carrying on with an unbound ``conn``.
    try:
        conn = psycopg2.connect("dbname='djangology' user='******' password=''")
    except psycopg2.Error:
        print("I am unable to connect to the database.")
        raise

    # The connection is only needed while loading, so always release it
    # (and the cursor), even if the query or the indexing fails.
    try:
        cur = conn.cursor()
        try:
            try:
                cur.execute("""SELECT * from dj_document""")
            except psycopg2.Error:
                print("I can't SELECT from dj_document")
                raise

            # Calculate near-duplicates index
            for row in cur.fetchall():
                self.index.append(row[1], row[0])
                self.files.append(row[0])
        finally:
            cur.close()
    finally:
        conn.close()
def __init__(self, test_docs_dir="./test"):
    """Index every visible regular file found directly in *test_docs_dir*.

    Entries that are not regular files, or whose name begins with a dot,
    are skipped. Each remaining file is read in full, trimmed of
    surrounding whitespace and punctuation, split on single spaces, and
    added to the index under the path returned by ``self.filename``.

    :param test_docs_dir: directory whose files are indexed.
    """
    self.test_docs_dir = test_docs_dir

    # Collect the names of the plain, non-hidden files in the directory.
    visible_files = []
    for entry in os.listdir(test_docs_dir):
        if not os.path.isfile(os.path.join(test_docs_dir, entry)):
            continue
        if entry[0] == ".":
            continue
        visible_files.append(entry)
    self.files = visible_files

    self.index = NearDuplicatesIndex()

    # Calculate near-duplicates index
    for name in self.files:
        path = self.filename(name)
        with open(path) as handle:
            text = handle.read()
        # Trim whitespace, then edge punctuation, then any whitespace the
        # punctuation was hiding, before splitting into tokens.
        trimmed = text.strip()
        trimmed = trimmed.strip(",.!|&-_()[]<>{}/\"'")
        trimmed = trimmed.strip()
        tokens = trimmed.split(" ")
        self.index.append(tokens, path)
def setUp(self):
    """Create the fixture: two sample token lists and a fresh index.

    The two documents differ only in their third token ('a' vs 'b').
    """
    self.docs = [
        ['this', 'is', 'a', 'document'],
        ['this', 'is', 'b', 'document'],
    ]
    self.index = NearDuplicatesIndex()