def test_reconnect():
    """Check that ``Storage.reconnect`` follows a changed ``Config().MongoServer``."""
    store = Storage(encryptDocuments=False)

    # Point the in-memory Config at a different server; Config().save() is
    # deliberately not called, so the change is only temporary.
    Config().MongoServer = "127.0.0.1:27017"

    # Before reconnecting, the storage must still report its previous address.
    configuredBefore = Config().MongoServer
    connectedBefore = "%s:%s" % (store.getHost(), store.getPort())
    assert configuredBefore != connectedBefore

    store.reconnect()

    # After reconnecting, the storage must report the configured address.
    configuredAfter = Config().MongoServer
    connectedAfter = "%s:%s" % (store.getHost(), store.getPort())
    assert configuredAfter == connectedAfter
def upload(title, author, category, data, documentClass=None):
    """
    Creates and stores a document from an uploaded file.

    :param title: The title to give the new document.
    :param author: The author to record on the new document.
    :param category: The category of the document. The special value ``"detect"`` lets the pickled classifier choose the category from the converted contents.
    :param data: The uploaded file. It must expose ``filename`` and a readable ``stream``.
    :param documentClass: Optional, the class that is instantiated for the new document. Defaults to ``Document``.
    :returns: A ``(document, error)`` tuple. When the file extension is not allowed, ``document`` is ``None`` and ``error`` holds a message; otherwise ``error`` is ``None``.
    """
    if documentClass is None:
        documentClass = Document

    # Maps every allowed file extension onto the Pandoc input format used for it.
    filetypes = {"docx": "docx", "txt": "plain", "pdf": "plain"}

    extension = data.filename.split(".")[-1]
    if extension not in filetypes:
        return None, "Not an allowed format."

    # Honour the documentClass argument instead of always building a Document.
    document = documentClass()
    document.title = title
    content = data.stream.read()
    document.contents = Pandoc().convert(filetypes[extension],
                                         "markdown_github",
                                         content,
                                         filetype=extension)
    document.category = category
    document.author = author

    # Category detection
    if document.category == "detect":
        modelDirectory = os.path.join(Config().WebAppDirectory, "..", "..")
        # NOTE(review): pickle.load can execute arbitrary code; these model
        # files must only ever come from a trusted source.
        with open(os.path.join(modelDirectory, "Classifier.cpic"), "rb") as classifierFile:
            classifier = pickle.load(classifierFile)
        with open(os.path.join(modelDirectory, "Vectorizer.cpic"), "rb") as vectorizerFile:
            vectorizer = pickle.load(vectorizerFile)

        # Strip the stopwords from the lowercased plain text before vectorizing.
        plainContents = Pandoc().convert("markdown_github", "plain",
                                         document.contents).lower()
        for stopword in Document.getStopwords():
            plainContents = plainContents.replace(stopword, "")

        matrix = vectorizer.transform([plainContents])
        print(matrix)
        print("Classifying")
        document.category = classifier.predict(matrix)[0]

    #document.countWords()
    document.vanillaSave()
    print("Document saved %s" % (document._id,))
    return document, None
def learnArticles(self, documents):
    """
    Collects the law articles mentioned in the given documents and prints the
    sections of ``wbvstrafrecht.txt`` that belong to the articles whose
    normalized name contains "strafrecht".

    :param documents: A dictionary of documents; only its values are used.
    :returns: Nothing. The matching sections are printed.
    """
    articles = {}
    for document in documents.values():
        contents = self.getDocumentContents(document)
        if contents is None:
            continue
        for article in self.articleRegex.finditer(contents):
            # Normalize the different spellings of "artikel" to one form.
            normArticle = article.group(0).strip(",.() ").replace(
                "art.", "artikel").replace("Artikel", "artikel")
            articles[normArticle] = article

    strafrechtArticles = [art for art in articles if "strafrecht" in art.lower()]

    with open(Config().WebAppDirectory + "/../../wbvstrafrecht.txt") as f:
        wbvstrafrecht = f.read().decode('ascii', errors="replace")

    # Both are loop invariants: lowercase the text and compile the pattern once.
    loweredText = wbvstrafrecht.lower()
    numberPattern = re.compile(r"(\d+|[IVX]+)([:\.]\d+)?([a-z]+)?")

    for art in strafrechtArticles:
        numbers = set("".join(nr) for nr in numberPattern.findall(art))
        for number in numbers:
            key = "**artikel " + number
            try:
                index = loweredText.index(key)
                # Print up to the next "**A" marker, searched from 20 characters on.
                print(wbvstrafrecht[index:wbvstrafrecht.index("**A", index + 20)])
                print("-" * 80)
            except ValueError:
                # str.index raises ValueError when the heading or the next
                # marker is absent; such articles are skipped on purpose.
                continue
def createUserKeyPair(self, password):
    """
    Generates a 2048 bit RSA key pair and stores it on this object.

    The private key is encrypted with AES in CBC mode, using the first 32
    bytes of an scrypt hash of the password as key. Salt, IV, public key and
    encrypted private key are stored base64 encoded in ``password_salt``,
    ``private_key_iv``, ``public_key`` and ``private_key``.

    :param password: The password the private key is protected with.
    """
    passwordBytes = password.encode('utf-8')
    salt = Random.new().read(32)

    print("Generating Scrypt hash")
    startedAt = time.time()
    aesKey = scrypt.hash(passwordBytes, salt,
                         N=1 << Config().scryptDifficulty)[:32]
    print("Took %s s." % (time.time() - startedAt))

    keyPair = RSA.generate(2048, Random.new().read)
    self.password_salt = base64.b64encode(salt)
    self.public_key = base64.b64encode(keyPair.publickey().exportKey())

    iv = Random.new().read(AES.block_size)
    aes = AES.new(aesKey, AES.MODE_CBC, iv)
    self.private_key_iv = base64.b64encode(iv)

    # Pad the exported key with "{" up to the next multiple of the block size.
    exportedKey = keyPair.exportKey()
    padLength = AES.block_size - len(exportedKey) % AES.block_size
    paddedKey = exportedKey + padLength * "{"
    self.private_key = base64.b64encode(aes.encrypt(paddedKey))

    del exportedKey
    del aesKey
def run(self): Config( self.CONFIG ) # Initialize the Config before switching to the webapp directory to make sure it gets loaded correctly. self.trainQueryEngine() #self.startLocal() self.startCompressed()
def __init__(self, encryptDocuments=True):
    """
    Initializes the connection with the Mongo database. As this is a Singleton, this is only done **once** per instance of GuardStore
    resulting in lower connection time overhead (unless `reconnect` is called.)
    This might pose problems if the ongoing connection is forcefully closed for whatever reason.

    :param encryptDocuments: Whether the AES cipher used for document encryption should be set up. True by default.
    """
    dbAddress = Config().MongoServer
    dbAddressComponents = dbAddress.split(":")
    host = dbAddressComponents[0]

    port = 27017  # The default MongoDB port.
    if len(dbAddressComponents) > 1:
        try:
            port = int(dbAddressComponents[1])
        except ValueError:
            # A non-numeric port falls back to the default port.
            pass

    self.__client = MongoClient(host, port)
    self.__currentCollection = None
    self.__currentDatabase = None
    self.__encryptDocuments = encryptDocuments

    if encryptDocuments:
        try:
            self.iv = base64.b64decode(Config().iv)
        except Exception:
            # No usable IV could be read from the Config: generate one and
            # store it. Catching Exception rather than everything keeps
            # KeyboardInterrupt and SystemExit from being swallowed.
            self.iv = Random.new().read(AES.block_size)
            Config().iv = base64.b64encode(self.iv)
            Config().save()
        # NOTE(review): the IV is stored in the Config and reused for this
        # cipher on every start; confirm a fixed IV is acceptable for CBC here.
        self.paddingChar = "{"
        self.cipher = AES.new(Config().encryptionKey, AES.MODE_CBC, self.iv)

    self.instantiated = True
def hashedWordCount(self, plainWordCount, results):
    """
    Replaces every word of a word count by the base64 encoded scrypt hash of
    that word and adds the resulting pairs to ``results``.

    :param plainWordCount: An iterable of ``(word, count)`` pairs.
    :param results: The collection the ``(hashed word, count)`` pairs are added to with ``+=``.
    """
    encode = base64.b64encode
    countsByHash = collections.defaultdict(int)
    # The database name from the Config serves as the salt for every word.
    salt = str(Config().database)

    for word, count in plainWordCount:
        digest = scrypt.hash(str(word), salt, N=1 << self.scryptHashFactor)
        countsByHash[encode(digest)] = count

    results += dict(countsByHash).items()
def __loopTermFrequencies(self, termFrequencies, tfidf, documentCount, wordCountsByKey):
    """
    Computes a tf-idf score for every word and stores it in ``tfidf``.

    :param termFrequencies: An iterable of ``(word, (count, tf))`` pairs; ``count`` is not used.
    :param tfidf: The mapping that receives ``(idf * tf, key)`` per word, where ``key`` is the base64 encoded scrypt hash of the word.
    :param documentCount: The number the inverse document frequency is relative to.
    :param wordCountsByKey: A mapping from hashed word to a count; missing keys count as 0.
    """
    # This function is parallelized due to the large amount of words and long CPU times.
    for word, frequency in termFrequencies:
        _, tf = frequency
        digest = scrypt.hash(str(word),
                             str(Config().database),
                             N=1 << self.scryptHashFactor)
        key = base64.b64encode(digest)
        knownCount = wordCountsByKey.get(key, 0)
        # The 1 in the denominator avoids a division by zero for unknown words.
        idf = math.log(float(documentCount) / (1 + knownCount))
        tfidf[word] = (idf * tf, key)
def setPassword(self, password):
    """
    Stores a bcrypt hash of the given password and generates a new key pair
    protected by it.

    :param password: The new plain text password.
    :returns: False, without changing anything, when ``checkPassword(password)`` is truthy (presumably: the password equals the current one). True once the new password and key pair are stored.
    """
    # Reject the password before hashing: bcrypt is deliberately expensive
    # and the hash would be thrown away when the password is refused.
    if self.checkPassword(password):
        return False

    self.password = bcrypt.hashpw(
        password.encode('utf-8'), bcrypt.gensalt(Config().bcryptDifficulty))
    self.createUserKeyPair(password)
    return True
def __encryptDocument(self, document):
    """
    Encrypts a dictionary/document before storage. The ``_id`` field is
    skipped for retrieval purposes. Field names are left intact and the
    structure of nested lists and dictionaries is maintained.

    :param document: The document that should be encrypted. This is a normal MongoDB document (dictionary) or a list.
    :returns: The MongoDB document with encrypted, base64 encoded values.
    :raises ValueError: If ``document`` is neither a list nor a dictionary.
    """
    def encryptValue(value):
        # Nested containers are handled recursively, everything else is
        # padded, encrypted with a new AES-CBC cipher and base64 encoded.
        if isinstance(value, (list, dict)):
            return self.__encryptDocument(value)
        cipher = AES.new(Config().encryptionKey, AES.MODE_CBC, self.iv)
        return base64.b64encode(cipher.encrypt(self.__padObject(value)))

    if isinstance(document, list):
        return [encryptValue(value) for value in document]

    if isinstance(document, dict):
        encryptedDocument = {}
        for key, value in document.items():
            # Do not encrypt the _id field.
            encryptedDocument[key] = value if key == "_id" else encryptValue(value)
        return encryptedDocument

    raise ValueError("Not a valid encryptable object.")
def translate(self, key, language="default"):
    """
    Takes a phrase and translates it to a given language based on the internationalization file.

    The localization file is loaded once and cached in ``self.localizationDict``.
    A phrase without a translation is recorded in that file under
    ``key + "**"`` with the value ``"NEEDSTRANSLATION"``, but only for
    languages that already have an entry in the file.

    :param key: The phrase to translate; it doubles as the lookup key.
    :param language: The language to translate to. ``"default"`` uses ``Config().language``; ``"en_US"`` returns the key unchanged.
    :returns: The translation, or ``key + "**"`` when no translation is known.
    """
    def localizationPath():
        # Resolved lazily: the Config is only consulted when the file is needed.
        return os.path.join(Config().WebAppDirectory, "../..", "localization.json")

    if self.localizationDict is None:
        with open(localizationPath(), "r") as ld:
            self.localizationDict = dict(json.load(ld))

    if language == "default":
        language = Config().language
    if language == "en_US":
        return key

    translations = self.localizationDict.get(language)
    if translations is not None and key in translations:
        return translations[key]

    marker = key + "**"
    # Record the missing phrase once; rewriting the file on every lookup of an
    # already recorded phrase would only repeat the same write.
    if translations is not None and marker not in translations:
        translations[marker] = "NEEDSTRANSLATION"
        with open(localizationPath(), "w") as ld:
            json.dump(self.localizationDict, ld, indent=4)
    return marker
def generateJurisprudenceDocuments(documentClass=None):
    """
    Meant to create documents from the jurisprudence dump; currently disabled.

    The function prints ``documentClass``, resolves its default and then
    returns ``False``. Every statement after that ``return`` is unreachable.

    :param documentClass: Optional, the class to create documents with. Defaults to ``Document``.
    :returns: Always ``False``.
    """
    print documentClass
    if documentClass is None:
        documentClass = Document
    """ This method needs to be changed to use the new mongo sample dump instead of the old jurisprudence.json. """
    return False

    # ---- Unreachable from here on; kept for the planned rewrite. ----
    jurisprudence = json.load(
        open(
            os.path.join(Config().WebAppDirectory, "..", "..",
                         "jurisprudence.json"), "r"))
    stopwords = Document.getStopwords()
    categorized = {}
    wiki = jurisprudence["Yuras"]["wiki"]
    flatWiki = Document.__flatten(wiki)
    dCounter = 1
    dTotal = len(flatWiki)
    for key, document in flatWiki.items():
        # The title is derived from the last "_" separated part of the key.
        # NOTE(review): strip(".html") removes any of the characters
        # ".", "h", "t", "m", "l" from both ends, not just the suffix.
        title = key.split("_")[-1].strip(".html").split("/")[-1].replace(
            "-", " ")
        title = title[0].upper() + title[1:]
        # The part before the first "_" is used as the category name.
        key = key.split("_")[0]
        if key not in categorized:
            categorized[key] = []
        # Create the category when no category with this name is stored yet.
        if len(Category().getObjectsByKey("name", key)) == 0:
            c = Category()
            c.name = key
            c.save()
        # Only create the document when no document with this title exists.
        if len(documentClass().getObjectsByKey("title", title)) == 0:
            print documentClass
            d = documentClass()
            print d
            print "Encrypting:", d._encrypt
            d.title = title
            d.category = key
            d.contents = Pandoc().convert("html", "markdown_github", document)
            d.author = "Yuras"
            d.document_type = "jurisprudence"
            if getattr(d, "_encrypt", True):
                d.countWords()
            d.vanillaSave()
        categorized[key].append(document)
        print title, "saved", dCounter, "/", dTotal
        dCounter += 1
def getArticlesFromContent(self):
    """
    Searches ``self.contents`` for references to law articles, stores the
    filtered article names in ``self.articles``, saves the object and returns
    the list.

    :returns: The list of filtered article names.
    """
    articleRegexString = "([Aa]rtikel(en)?|art\.) ((\d+|[IVX]+)([:\.]\d+)?([a-z]+)?( en |, ?)?)+ ?(([etdvzan][ewic][a-z]+?((d|st)e[^a-z]))?(lid|paragraaf|volzin)( \d+)?([, ])*)*((van|het|de|Wet|wet) )*( ?(([A-Z]([A-Z]{1,4}|[a-z]{1,2}))[^\w]) ?(\d{4})?|([\w\-]+ ?)+ ?(\d{4})?)"
    articleRegex = re.compile(articleRegexString)

    # Collect every full match as a string, stripped of ",", "." and spaces.
    results = set()
    for result in articleRegex.finditer(self.contents):
        results.add(result.group(0).strip(",. "))

    # Every line of wetboeken.csv becomes a list of its ";" separated values.
    with open(
            os.path.join(Config().WebAppDirectory, "..", "..",
                         "wetboeken.csv")) as wetboekencsv:
        wetboeken = [
            wb.strip(";").split(";")
            for wb in wetboekencsv.read().split("\n")
        ]

    filteredArticles = []
    filterwords = ["onder", "lid", "alinea"]
    for result in results:
        # NOTE(review): `result` is a string here (see results.add above), so
        # this comprehension iterates over its characters: `suffix` is the
        # second-to-last character and result[0], result[2] and result[3]
        # below are single characters. The code reads as if it was written
        # for tuples of regex groups - confirm the intent.
        suffix = [g for g in result if g is not None][-2]
        if True not in [f in suffix for f in filterwords
                        ] and suffix not in list("0123456789"):
            found = False
            if suffix == "deze wet":
                # NOTE(review): `wetboek` is only bound by the loop further
                # down, so reaching this branch either raises a NameError or
                # reuses the value left by an earlier iteration.
                articleName = " ".join([result[0], result[3], wetboek[0]])
                print articleName
                filteredArticles.append(articleName)
                continue
            for wetboek in wetboeken:
                # Stop at the first row of wetboeken.csv that matched.
                if found:
                    break
                for wb in wetboek:
                    if wb in suffix:
                        found = True
                        articleName = " ".join(
                            [result[0], str(result[2]), wetboek[0]])
                        print articleName
                        filteredArticles.append(articleName)

    self.articles = filteredArticles
    super(Document, self).save()
    return filteredArticles
def __init__(self, database=None, collection=None, name=""):
    """
    Sets up the object.

    :param database: Optional, the database where this object is to be stored. Defaults to the database stored in Config.
    :param collection: Optional, the collection where this object is to be stored.
    :param name: The pretty name of this object.
    """
    self._database = Config().database if database is None else database
    self._collection = collection
    self.__storage = Storage()
    self.name = name
    self._created = datetime.datetime.now()
    self._type = self.__class__.__name__

    # Keep an _encrypt value that is already present on the object or its
    # class; only objects without one default to encryption.
    if not hasattr(self, "_encrypt"):
        self._encrypt = True
def search(query):
    """
    Returns the results, the query that was used by the QueryEngine and the facets.

    :param query: The query; it is returned unchanged as the second element.
    :returns: A ``(results, query, facets)`` tuple in which ``facets`` is an empty dictionary.
    """
    # NOTE(review): `words` and `self` are not parameters of this function, so
    # they have to come from an enclosing scope - confirm against the caller.
    def tagField(word):
        # Tags are stored under the base64 encoded scrypt hash of the word.
        digest = scrypt.hash(str(word),
                             str(Config().database),
                             N=1 << self.scryptHashFactor)
        return "tags." + base64.b64encode(digest)

    matchedObjects = {tagField(word): {"$exists": True} for word in words}
    results = self.matchObjects(matchedObjects,
                                fields={"_id": True, "title": True})
    for result in results:
        result.markedWords = "No highlighting available for secured documents."
    return results, query, {}
def quickSearch(words):
    """
    Looks up the documents that are tagged with every given word.

    :param words: The words to search for.
    :returns: A JSON object string that maps document ids onto document titles.
    """
    # NOTE(review): `self` is not a parameter of this function, so it has to
    # come from an enclosing scope - confirm against the caller.
    def tagField(word):
        # Tags are stored under the base64 encoded scrypt hash of the word.
        digest = scrypt.hash(str(word),
                             str(Config().database),
                             N=1 << self.scryptHashFactor)
        return "tags." + base64.b64encode(digest)

    matchedObjects = {tagField(word): {"$exists": True} for word in words}
    results = self.matchObjects(matchedObjects,
                                fields={"_id": True, "title": True})
    titlesById = {str(document._id): document.title for document in results}
    return json.dumps(titlesById)
def trainQueryEngine(self):
    """
    Makes sure the QueryEngine has a SpellingEngine and a ThesaurusEngine.

    The SpellingEngine is only trained when it is missing and
    ``self.TRAIN_QE`` is truthy; it is trained with at most
    ``self.TRAIN_QE_SIZE`` documents. The ThesaurusEngine is built from
    ``thesaurus.txt`` when it is missing.
    """
    engine = QueryEngine()

    if engine.SpellingEngine is None and self.TRAIN_QE:
        print("Training SpellingEngine with up to %s documents." % self.TRAIN_QE_SIZE)
        speller = SpellingEngine()
        speller.model = speller.trainWithDatabaseDocuments(limit=self.TRAIN_QE_SIZE)
        engine.SpellingEngine = speller
    else:
        print("Not training SpellingEngine.")

    # Nothing left to do when a ThesaurusEngine is already available.
    if engine.ThesaurusEngine is not None:
        return

    print("Training ThesaurusEngine.")
    thesaurusEngine = ThesaurusEngine()
    thesaurusPath = os.path.join(Config().WebAppDirectory, "..", "..",
                                 "thesaurus.txt")
    with open(thesaurusPath) as thesaurusFile:
        thesaurus = thesaurusFile.read()
    thesaurusEngine.parseOpentaalThesaurus(thesaurus)
    engine.ThesaurusEngine = thesaurusEngine
def matchObjects(self, match, limit=None, skip=0, fields=None, sort=None, reverse=False):
    """
    This method allows you to match a StoredObject directly. It allows for more advanced queries.

    :param match: A query dictionary.
    :param limit: The maximum amount of objects to return. Will return all results by default.
    :param skip: The amount of objects to skip, basically an offset.
    :param fields: The fields to return for this object. The dictionary passed in is not modified.
    :param sort: The documents are sorted by the given indices. This will be slower on encrypted documents, as they are sorted in Python instead of in the database.
    :param reverse: Whether or not documents are returned in reverse. False by default.
    :rtype: All the matching objects stored in the database.
    :raises ValueError: If the object has no database or no collection assigned.
    """
    # Work on a copy: the _encrypt key added below must not leak into the
    # dictionary owned by the caller.
    fields = {} if fields is None else dict(fields)

    storage = self.__storage
    database = self._database
    collection = self._collection
    if database is None or collection is None:
        raise ValueError("The object needs to be assigned a database and a collection.")
    storage.getDatabase(database)
    storage.getCollection(collection)

    # Always try to add the _encrypt key, to ensure unencrypted documents won't have an attempted decryption.
    # This is only done when fields is a non-empty inclusion list, i.e. when no field is switched off.
    fieldValues = list(fields.values())
    if fields and 0 not in fieldValues and False not in fieldValues:
        fields["_encrypt"] = True

    if getattr(self, "_encrypt", True) and Config().encryptDocuments:
        # Encrypted documents cannot be sorted by the database, so they are
        # fetched unsorted and sorted here.
        documents = storage.getDocuments(match, limit, skip, fields, sort=None)
        if sort is not None:
            documents.sort(key=lambda d: self.__multi_get(d, sort, default=""),
                           reverse=reverse)
        elif reverse:
            documents.reverse()
    else:
        documents = storage.getDocuments(match, limit, skip, fields,
                                         sort=sort, _encrypted=False)
        if reverse:
            documents.reverse()

    return [self.loadFromRawData(data) for data in documents]
def storeWebAppDirectories(self):
    """
    Stores the webapp directories into the config: the current working
    directory as ``WebAppDirectory`` and its ``templates`` subdirectory as
    ``TemplatesDirectory``. The Config is saved afterwards.
    """
    webAppDirectory = os.getcwd()
    templatesDirectory = os.path.join(webAppDirectory, "templates")

    Config().WebAppDirectory = webAppDirectory
    Config().TemplatesDirectory = templatesDirectory
    Config().save()
def test_storeWebAppDirectories():
    """
    Check that ``Server.storeWebAppDirectories`` stores the working directory
    and its ``templates`` subdirectory in the Config.
    """
    # Imported locally so the test does not depend on the module's imports.
    import os

    s = Server()
    s.storeWebAppDirectories()

    # Assert the attributes storeWebAppDirectories actually writes.
    config = Config()
    assert config.WebAppDirectory == os.getcwd()
    assert config.TemplatesDirectory == os.path.join(os.getcwd(), "templates")
def getStopwords():
    """
    Reads ``stopwords.txt``, located two directories above the webapp directory.

    :returns: The contents of the file split on newlines, as a list.
    """
    stopwordsPath = os.path.join(Config().WebAppDirectory, "../..",
                                 "stopwords.txt")
    with open(stopwordsPath, "r") as stopwordsFile:
        return stopwordsFile.read().split("\n")
def getMostCommonPasswords(n=10000):
    """
    Reads ``common-passwords.txt``, located two directories above the webapp directory.

    :param n: The maximum amount of passwords to return. 10000 by default.
    :returns: The first ``n`` lines of the file, as a list.
    """
    passwordsPath = os.path.join(Config().WebAppDirectory, "../..",
                                 "common-passwords.txt")
    with open(passwordsPath, "r") as passwordsFile:
        lines = passwordsFile.read().split("\n")
    return lines[:n]
def __init__(self):
    """
    Sets the location of the rechtspraak data and compiles the regular
    expressions that are stored on this object.
    """
    # Pattern for references to law articles, e.g. starting with "artikel",
    # "Artikelen" or "art." followed by an article number.
    articlePattern = r"([Aa]rtikel(en)?|art\.) ((\d+|[IVX]+)([:\.]\d+)?([a-z]+)?( en |, ?)?)+ ?(([etdvzan][ewic][a-z]+?((d|st)e[^a-z]))?(lid|paragraaf|volzin)( \d+)?([, ])*)*((van|het|de|Wet|wet) )*( ?(([A-Z]([A-Z]{1,4}|[a-z]{1,2}))[^\w]) ?(\d{4})?|([\w\-]+ ?)+ ?(\d{4})?)"
    # Matches any run of characters outside a-z, including the empty run.
    delimitorPattern = "[^a-z]*"

    self.rechtspraakFolder = Config().WebAppDirectory+"/../../rechtspraak"
    self.jsonFileName = "10krecords.json"
    self.delimitor = re.compile(delimitorPattern)
    self.articleRegex = re.compile(articlePattern)
def __resetCipher(self):
    """
    Resets the cipher by replacing the ``_cipher`` attribute of the existing
    ``self.cipher`` object with a new low-level AES instance, which avoids
    constructing a new PyCrypto cipher object.

    :returns: ``self.cipher``, now holding the new AES instance.
    """
    wrapper = self.cipher
    wrapper._cipher = _AES.new(Config().encryptionKey, AES.MODE_CBC, self.iv)
    return wrapper