示例#1
0
def test_reconnect():
    """Check that Storage only follows a changed Config().MongoServer after reconnect()."""
    storage = Storage(encryptDocuments=False)

    def storageAddress():
        # "host:port" as currently reported by the storage object.
        return "%s:%s" % (storage.getHost(), storage.getPort())

    # Point the in-memory Config at a different server; the Config is not saved.
    Config().MongoServer = "127.0.0.1:27017"

    # Without a reconnect the storage must still report its previous address.
    assert Config().MongoServer != storageAddress()

    storage.reconnect()

    # After the reconnect the storage must report the address set in the Config.
    assert Config().MongoServer == storageAddress()
示例#2
0
    def upload(title, author, category, data, documentClass=None):
        if documentClass is None:
            documentClass = Document

        error = None
        filetypes = {"docx": "docx", "txt": "plain", "pdf": "plain"}

        f = data
        filename = f.filename
        extension = filename.split(".")[-1]
        if extension not in filetypes.keys():
            error = "Not an allowed format."
            return None, error

        document = Document()
        document.title = title
        content = f.stream.read()
        document.contents = Pandoc().convert(filetypes[extension],
                                             "markdown_github",
                                             content,
                                             filetype=extension)
        document.category = category
        document.author = author

        # Category detection
        if document.category == "detect":
            classifier = pickle.load(
                open(
                    os.path.join(Config().WebAppDirectory, "..", "..",
                                 "Classifier.cpic"), "rb"))
            vectorizer = pickle.load(
                open(
                    os.path.join(Config().WebAppDirectory, "..", "..",
                                 "Vectorizer.cpic"), "rb"))
            stopwords = Document.getStopwords()

            plainContents = Pandoc().convert("markdown_github", "plain",
                                             document.contents).lower()
            for stopword in stopwords:
                plainContents = plainContents.replace(stopword, "")

            matrix = vectorizer.transform([plainContents])
            print matrix
            print "Classifying"
            document.category = classifier.predict(matrix)[0]

        #document.countWords()
        document.vanillaSave()
        print "Document saved", document._id

        return document, error
	def learnArticles(self, documents):
		# This regex matches everything after the initial article number.
		articles = {}
		
		for _id,document in documents.items():
			contents = self.getDocumentContents(document)
			if contents is None:
				continue
				
			for article in self.articleRegex.finditer( contents ):
				normArticle = article.group(0).strip(",.() ").replace("art.", "artikel").replace("Artikel","artikel")
				articles[normArticle] = article
				
		strafrechtArticles = [ art for art in articles if "strafrecht" in art.lower()]
		
		with open(Config().WebAppDirectory+"/../../wbvstrafrecht.txt") as f:
			wbvstrafrecht = f.read().decode('ascii',errors="replace")
		
		for art in strafrechtArticles:
			numbers = list(set([ "".join(nr) for nr in re.findall("(\d+|[IVX]+)([:\.]\d+)?([a-z]+)?",art)]))
			for number in numbers:
				key = "**artikel "+number
				
				try:
					#print key
					index = wbvstrafrecht.lower().index(key)
					print wbvstrafrecht[index:wbvstrafrecht.index("**A", index+20)]
					print "-"*80
				except:
					pass
示例#4
0
    def createUserKeyPair(self, password):
        password = password.encode('utf-8')
        salt = Random.new().read(32)

        print "Generating Scrypt hash"
        t = time.time()
        derived_key = scrypt.hash(password,
                                  salt,
                                  N=1 << Config().scryptDifficulty)[:32]
        print "Took %s s." % (time.time() - t)

        rsakey = RSA.generate(2048, Random.new().read)

        self.password_salt = base64.b64encode(salt)
        self.public_key = base64.b64encode(rsakey.publickey().exportKey())
        iv = Random.new().read(AES.block_size)

        cipher = AES.new(derived_key, AES.MODE_CBC, iv)
        self.private_key_iv = base64.b64encode(iv)
        private_key = rsakey.exportKey()
        paddingChar = "{"
        self.private_key = base64.b64encode(
            cipher.encrypt(private_key +
                           (AES.block_size -
                            len(private_key) % AES.block_size) * paddingChar))

        del private_key
        del derived_key
示例#5
0
 def run(self):
     """ Initializes the Config, then calls `trainQueryEngine` and `startCompressed`. """
     # The Config is initialized before switching to the webapp directory,
     # to make sure it gets loaded correctly.
     Config(self.CONFIG)

     self.trainQueryEngine()

     #self.startLocal()
     self.startCompressed()
示例#6
0
    def __init__(self, encryptDocuments=True):
        """ Initializes the connection with the Mongo database.

        The server address is read from `Config().MongoServer`, either as "host" or as "host:port".
        According to the original notes this class is a Singleton, so this is only done **once**
        (unless `reconnect` is called), which lowers the connection time overhead. This might pose
        problems if the ongoing connection is forcefully closed for whatever reason.

        :param encryptDocuments: Whether an AES cipher should be prepared for document encryption. True by default.
        """
        dbAddressComponents = Config().MongoServer.split(":")
        host = dbAddressComponents[0]

        try:
            port = int(dbAddressComponents[1])
        except (IndexError, ValueError):
            # No port given, or not a number: fall back to the default MongoDB port.
            port = 27017

        self.__client = MongoClient(host, port)
        self.__currentCollection = None
        self.__currentDatabase = None
        self.__encryptDocuments = encryptDocuments

        if encryptDocuments:
            try:
                self.iv = base64.b64decode(Config().iv)
            except Exception:
                # No usable IV in the Config yet: generate one and persist it for later runs.
                self.iv = Random.new().read(AES.block_size)
                Config().iv = base64.b64encode(self.iv)
                Config().save()

            self.paddingChar = "{"
            # NOTE(review): the same IV is stored once and reused for every CBC encryption.
            # Changing this would make existing data undecryptable, so it is only flagged here.
            self.cipher = AES.new(Config().encryptionKey, AES.MODE_CBC,
                                  self.iv)
        self.instantiated = True
示例#7
0
 def hashedWordCount(self, plainWordCount, results):
     """ Hashes the words of a word count and appends the hashed counts to `results`.

     :param plainWordCount: An iterable of `(word, count)` pairs.
     :param results: The object that receives the `(key, count)` pairs through `+=`, where key is the base64 encoded scrypt hash of the word.
     """
     encode = base64.b64encode
     salt = str(Config().database)
     countsByKey = {}
     for word, count in plainWordCount:
         digest = scrypt.hash(str(word), salt, N=1 << self.scryptHashFactor)
         countsByKey[encode(digest)] = count
     # Hand the pairs over from a copy of the dictionary.
     results += dict(countsByKey).items()
示例#8
0
 def __loopTermFrequencies(self, termFrequencies, tfidf, documentCount,
                           wordCountsByKey):
     """ Computes the tf-idf value of every word and stores it in `tfidf`.

     :param termFrequencies: An iterable of `(word, (count, tf))` pairs; the count is not used here.
     :param tfidf: The mapping that receives `word -> (idf * tf, key)`. It is modified in place.
     :param documentCount: The numerator of the idf (presumably the total number of documents).
     :param wordCountsByKey: A mapping that is looked up with the hashed word; missing keys count as 0.
     """
     # This function is parallelized due to the large amount of words and long CPU times.
     # The salt, the scrypt cost and the float conversion do not change per word, so compute them once.
     salt = str(Config().database)
     hashCost = 1 << self.scryptHashFactor
     totalDocuments = float(documentCount)

     for word, (_count, tf) in termFrequencies:
         key = base64.b64encode(scrypt.hash(str(word), salt, N=hashCost))
         # The 1 in the denominator avoids a division by zero for unknown words.
         idf = math.log(totalDocuments / (1 + wordCountsByKey.get(key, 0)))
         tfidf[word] = (idf * tf, key)
示例#9
0
    def setPassword(self, password):
        """ Stores a new password hash and generates a new key pair for the password.

        :param password: The new password.
        :returns: False when `checkPassword` already accepts the password (nothing is changed in that case), True after the new bcrypt hash and key pair have been stored.
        """
        # Reject an unchanged password first, before paying for the bcrypt hash.
        if self.checkPassword(password):
            return False

        self.password = bcrypt.hashpw(
            password.encode('utf-8'),
            bcrypt.gensalt(Config().bcryptDifficulty))
        self.createUserKeyPair(password)
        return True
示例#10
0
    def __encryptDocument(self, document):
        """ Encrypts a dictionary/document before storage. Skips the ID field for retrieval purposes. Field names are left intact.
        Structural information is also maintained: nested lists and dictionaries are encrypted recursively.

        :param document: The document that should be encrypted. This is a normal MongoDB document (dictionary), or a list of values.
        :returns: The MongoDB document with encrypted, base64 encoded values.
        :raises ValueError: When `document` is neither a list nor a dictionary.
        """
        def encryptValue(value):
            # Containers keep their structure; only their leaves are encrypted.
            if isinstance(value, (list, dict)):
                return self.__encryptDocument(value)
            # A new cipher object is created for every value (presumably so that
            # CBC state does not carry over from one value to the next).
            cipher = AES.new(Config().encryptionKey, AES.MODE_CBC, self.iv)
            return base64.b64encode(cipher.encrypt(self.__padObject(value)))

        if isinstance(document, list):
            return [encryptValue(value) for value in document]

        if isinstance(document, dict):
            encryptedDocument = {}
            for key, value in document.items():
                # Do not encrypt the _id field.
                encryptedDocument[key] = value if key == "_id" else encryptValue(value)
            return encryptedDocument

        raise ValueError("Not a valid encryptable object.")
示例#11
0
    def translate(self, key, language="default"):
        """ Takes a phrase and translates it to a given language based on the internationalization file.

        :param key: The phrase that should be translated.
        :param language: The target language. "default" uses `Config().language`; "en_US" returns the key unchanged.
        :returns: The translation, or the key followed by "**" when no translation is available.
        """
        localizationPath = os.path.join(Config().WebAppDirectory, "../..",
                                        "localization.json")
        if self.localizationDict is None:
            with open(localizationPath, "r") as ld:
                self.localizationDict = dict(json.load(ld))

        if language == "default":
            language = Config().language

        if language == "en_US":
            return key

        languageDict = self.localizationDict.get(language)
        marker = key + "**"
        if languageDict is None:
            # Unknown language: nothing to look up and nothing to record.
            return marker

        if key in languageDict:
            return languageDict[key]

        # Record the missing phrase once; the file is only rewritten when the
        # marker is new, not on every lookup of the same missing phrase.
        if marker not in languageDict:
            languageDict[marker] = "NEEDSTRANSLATION"
            with open(localizationPath, "w") as ld:
                json.dump(self.localizationDict, ld, indent=4)

        return marker
示例#12
0
    def generateJurisprudenceDocuments(documentClass=None):
        # Currently disabled: prints the given class and always returns False.
        # The code after the `return False` below is unreachable; it is the old
        # import of jurisprudence.json, which created a Category per key and one
        # document (of `documentClass`, Document by default) per wiki entry.
        print documentClass
        if documentClass is None:
            documentClass = Document
        """ This method needs to be changed to use the new mongo sample dump instead of the old jurisprudence.json. """
        # Everything below this return is never executed.
        return False
        jurisprudence = json.load(
            open(
                os.path.join(Config().WebAppDirectory, "..", "..",
                             "jurisprudence.json"), "r"))

        # NOTE(review): `stopwords` is not used anywhere below.
        stopwords = Document.getStopwords()

        categorized = {}
        wiki = jurisprudence["Yuras"]["wiki"]
        # NOTE(review): a double-underscore name is mangled with the name of the
        # enclosing class when this code is compiled inside a class body - confirm
        # that it resolves to the intended helper.
        flatWiki = Document.__flatten(wiki)
        dCounter = 1
        dTotal = len(flatWiki)
        for key, document in flatWiki.items():
            # The title is derived from the part of the key after the last "_".
            # NOTE(review): str.strip(".html") strips any of the characters
            # ".", "h", "t", "m", "l" from both ends, not the literal suffix.
            title = key.split("_")[-1].strip(".html").split("/")[-1].replace(
                "-", " ")
            title = title[0].upper() + title[1:]
            # The part of the key before the first "_" is used as category name.
            key = key.split("_")[0]

            if key not in categorized:
                categorized[key] = []
                # Create the category when no category with this name is found.
                if len(Category().getObjectsByKey("name", key)) == 0:
                    c = Category()
                    c.name = key
                    c.save()

            # Only create the document when no document with this title is found.
            if len(documentClass().getObjectsByKey("title", title)) == 0:
                print documentClass
                d = documentClass()
                print d
                print "Encrypting:", d._encrypt
                d.title = title
                d.category = key
                d.contents = Pandoc().convert("html", "markdown_github",
                                              document)
                d.author = "Yuras"
                d.document_type = "jurisprudence"
                # Words are only counted when `_encrypt` is truthy (or missing).
                if getattr(d, "_encrypt", True):
                    d.countWords()

                d.vanillaSave()

            categorized[key].append(document)
            print title, "saved", dCounter, "/", dTotal
            dCounter += 1
示例#13
0
    def getArticlesFromContent(self):
        # Searches `self.contents` for article references, combines them with the
        # names read from wetboeken.csv, stores the resulting list in
        # `self.articles`, saves the object through the parent class of Document
        # and returns the list.
        articleRegexString = "([Aa]rtikel(en)?|art\.) ((\d+|[IVX]+)([:\.]\d+)?([a-z]+)?( en |, ?)?)+ ?(([etdvzan][ewic][a-z]+?((d|st)e[^a-z]))?(lid|paragraaf|volzin)( \d+)?([, ])*)*((van|het|de|Wet|wet) )*( ?(([A-Z]([A-Z]{1,4}|[a-z]{1,2}))[^\w]) ?(\d{4})?|([\w\-]+ ?)+ ?(\d{4})?)"
        articleRegex = re.compile(articleRegexString)

        results = set()

        # Collect the full text of every match, without surrounding ",", "." and " ".
        for result in articleRegex.finditer(self.contents):
            results.add(result.group(0).strip(",. "))

        # Every line of wetboeken.csv becomes a list of its ";" separated values.
        with open(
                os.path.join(Config().WebAppDirectory, "..", "..",
                             "wetboeken.csv")) as wetboekencsv:
            wetboeken = [
                wb.strip(";").split(";")
                for wb in wetboekencsv.read().split("\n")
            ]

        filteredArticles = []
        filterwords = ["onder", "lid", "alinea"]
        for result in results:
            # NOTE(review): `results` holds strings, so iterating `result` yields
            # single characters: `suffix` is the second to last character of the
            # match and `result[0]`, `result[2]`, `result[3]` below are characters
            # too. This looks like it was written for the tuple of regex groups -
            # confirm the intent before relying on the output. A match shorter
            # than two characters raises IndexError here.
            suffix = [g for g in result if g is not None][-2]
            if True not in [f in suffix for f in filterwords
                            ] and suffix not in list("0123456789"):
                found = False

                # NOTE(review): `wetboek` is only bound by the loop further down,
                # so this branch would use the value left over from an earlier
                # pass, or fail when that loop has not run yet.
                if suffix == "deze wet":
                    articleName = " ".join([result[0], result[3], wetboek[0]])
                    print articleName
                    filteredArticles.append(articleName)
                    continue

                # Stop at the first line of wetboeken.csv with a matching value.
                for wetboek in wetboeken:
                    if found:
                        break
                    # The inner loop is not left after a hit, so one line can add
                    # the same article name more than once.
                    for wb in wetboek:
                        if wb in suffix:
                            found = True
                            articleName = " ".join(
                                [result[0],
                                 str(result[2]), wetboek[0]])
                            print articleName
                            filteredArticles.append(articleName)

        self.articles = filteredArticles
        super(Document, self).save()

        return filteredArticles
示例#14
0
    def __init__(self, database=None, collection=None, name=""):
        """ Prepares the object for storage.

        :param database: Optional, the database in which this object is stored. Falls back to the database stored in Config.
        :param collection: Optional, the collection in which this object is stored.
        :param name: The pretty name of this object.
        """
        self._database = Config().database if database is None else database
        self._collection = collection
        self.__storage = Storage()
        self.name = name
        self._created = datetime.datetime.now()
        self._type = self.__class__.__name__

        # Keep an `_encrypt` attribute that already exists (e.g. one defined on
        # the class); encryption is only switched on when there is none.
        if not hasattr(self, "_encrypt"):
            self._encrypt = True
示例#15
0
    def search(query):
        """ Returns the results, the query that was passed in and an empty dictionary for the facets.

        Documents match when they have a "tags.<key>" field for the hashed words.
        """
        # NOTE(review): `words` and `self` are not parameters of this function;
        # they have to come from an enclosing scope that is not visible here.
        def tagField(word):
            # The field name is "tags." followed by the base64 encoded scrypt hash of the word.
            digest = scrypt.hash(str(word),
                                 str(Config().database),
                                 N=1 << self.scryptHashFactor)
            return "tags." + base64.b64encode(digest)

        matchedObjects = dict(
            (tagField(word), {"$exists": True}) for word in words)

        wantedFields = {"_id": True, "title": True}
        results = self.matchObjects(matchedObjects, fields=wantedFields)

        for result in results:
            result.markedWords = "No highlighting available for secured documents."

        return results, query, {}
示例#16
0
    def quickSearch(words):
        """ Returns a JSON object that maps the id of every matching document to its title.

        :param words: The words to search for; each one is hashed into a "tags.<key>" field that has to exist.
        """
        # NOTE(review): `self` is not a parameter of this function; it has to
        # come from an enclosing scope that is not visible here.
        matchedObjects = {}
        for word in words:
            digest = scrypt.hash(str(word),
                                 str(Config().database),
                                 N=1 << self.scryptHashFactor)
            tagField = "tags." + base64.b64encode(digest)
            matchedObjects[tagField] = {"$exists": True}

        wantedFields = {"_id": True, "title": True}
        results = self.matchObjects(matchedObjects, fields=wantedFields)

        titlesById = {str(document._id): document.title for document in results}
        return json.dumps(titlesById)
示例#17
0
    def trainQueryEngine(self):
        qe = QueryEngine()

        if qe.SpellingEngine is None and self.TRAIN_QE:
            print "Training SpellingEngine with up to %s documents." % self.TRAIN_QE_SIZE
            se = SpellingEngine()
            se.model = se.trainWithDatabaseDocuments(limit=self.TRAIN_QE_SIZE)
            qe.SpellingEngine = se
        else:
            print "Not training SpellingEngine."

        if qe.ThesaurusEngine is None:
            print "Training ThesaurusEngine."
            te = ThesaurusEngine()
            with open(
                    os.path.join(Config().WebAppDirectory, "..", "..",
                                 "thesaurus.txt")) as thesaurusFile:
                thesaurus = thesaurusFile.read()
            te.parseOpentaalThesaurus(thesaurus)
            qe.ThesaurusEngine = te
示例#18
0
    def matchObjects(self,
                     match,
                     limit=None,
                     skip=0,
                     fields=None,
                     sort=None,
                     reverse=False):
        """ This method allows you to match a StoredObject directly. It allows for more advanced queries.

        :param match: A query dictionary.
        :param limit: The maximum amount of objects to return. Will return all results by default.
        :param skip: The amount of objects to skip, basically an offset.
        :param fields: The fields to return for this object. The dictionary passed in is not modified.
        :param sort: The documents are sorted by the given indices. This will be slower on encrypted documents, as they are sorted in Python instead of in the database.
        :param reverse: Whether or not documents are returned in reverse. False by default.
        :rtype: All the matching objects stored in the database.
        :raises ValueError: When the object has no database or no collection assigned.
        """
        # Work on a copy so that the caller's dictionary is never modified.
        fields = {} if fields is None else dict(fields)

        storage = self.__storage
        database = self._database
        collection = self._collection

        if database is None or collection is None:
            raise ValueError(
                "The object needs to be assigned a database and a collection.")

        storage.getDatabase(database)
        storage.getCollection(collection)

        # Always try to add the _encrypt key, to ensure unencrypted documents won't have an attempted decryption.
        # This is only done for a non-empty projection without excluded fields (False == 0, so one check covers both).
        if fields and 0 not in fields.values():
            fields["_encrypt"] = True

        sortDecrypted = bool(
            getattr(self, "_encrypt", True) and Config().encryptDocuments)
        if sortDecrypted:
            # Encrypted documents are fetched unsorted and sorted in Python below.
            # NOTE(review): limit and skip are handed to the storage before that sort, so
            # presumably only the returned page is sorted - confirm against getDocuments.
            documents = storage.getDocuments(match,
                                             limit,
                                             skip,
                                             fields,
                                             sort=None)
        else:
            documents = storage.getDocuments(match,
                                             limit,
                                             skip,
                                             fields,
                                             sort=sort,
                                             _encrypted=False)

        if sort is not None and sortDecrypted:
            documents.sort(key=lambda d: self.__multi_get(d, sort, default=""),
                           reverse=reverse)
        elif reverse:
            documents.reverse()

        return [self.loadFromRawData(data) for data in documents]
示例#19
0
 def storeWebAppDirectories(self):
     """ Saves the current working directory and its "templates" subdirectory in the Config. """
     workingDirectory = os.getcwd()
     templatesDirectory = os.path.join(workingDirectory, "templates")

     Config().WebAppDirectory = workingDirectory
     Config().TemplatesDirectory = templatesDirectory
     Config().save()
示例#20
0
def test_storeWebAppDirectories():
    """Check that the Config has a RootDirectory after storing the webapp directories."""
    server = Server()
    server.storeWebAppDirectories()

    # NOTE(review): confirm that `storeWebAppDirectories` (or the loaded Config)
    # actually provides RootDirectory; this test depends on it.
    config = Config()
    assert hasattr(config, "RootDirectory")
示例#21
0
 def getStopwords():
     """ Returns the lines of stopwords.txt as a list. """
     stopwordsPath = os.path.join(Config().WebAppDirectory, "../..",
                                  "stopwords.txt")
     with open(stopwordsPath, "r") as stopwordsFile:
         # Splitting on "\n" keeps an empty last entry when the file ends with a newline.
         return stopwordsFile.read().split("\n")
示例#22
0
 def getMostCommonPasswords(n=10000):
     """ Returns the first `n` lines of common-passwords.txt as a list.

     :param n: The maximum amount of passwords to return. 10000 by default.
     """
     passwordsPath = os.path.join(Config().WebAppDirectory, "../..",
                                  "common-passwords.txt")
     with open(passwordsPath, "r") as passwordsFile:
         fileContents = passwordsFile.read()

     allPasswords = fileContents.split("\n")
     return allPasswords[:n]
示例#23
0
	def __init__(self):
		""" Sets the rechtspraak folder, the JSON file name and the regular expressions used by this object. """
		# The article pattern, written as its consecutive parts; the pieces are
		# joined by the compiler into one unchanged pattern string.
		articlePattern = (
			# "Artikel", "artikel", optionally followed by "en", or "art."
			r"([Aa]rtikel(en)?|art\.) "
			# one or more numbers (digits or I/V/X) with optional ":n"/".n" and letters, separated by " en " or ","
			r"((\d+|[IVX]+)([:\.]\d+)?([a-z]+)?( en |, ?)?)+ ?"
			# optional "lid" / "paragraaf" / "volzin" parts, each with an optional number
			r"(([etdvzan][ewic][a-z]+?((d|st)e[^a-z]))?(lid|paragraaf|volzin)( \d+)?([, ])*)*"
			# optional "van", "het", "de", "Wet" or "wet" words
			r"((van|het|de|Wet|wet) )*"
			# the closing name: a short capitalised token or a run of words, each with an optional four digit number
			r"( ?(([A-Z]([A-Z]{1,4}|[a-z]{1,2}))[^\w]) ?(\d{4})?|([\w\-]+ ?)+ ?(\d{4})?)"
		)

		self.rechtspraakFolder = Config().WebAppDirectory+"/../../rechtspraak"
		self.jsonFileName = "10krecords.json"
		self.delimitor = re.compile("[^a-z]*")
		self.articleRegex = re.compile(articlePattern)
示例#24
0
 def __resetCipher(self):
     """ Gives the existing cipher object a new low-level AES instance and returns the cipher.

     Only the inner `_cipher` is replaced, which skips the PyCrypto rituals and the Python instantiation overhead of building a whole new cipher object.
     """
     cipher = self.cipher
     setattr(cipher, "_cipher",
             _AES.new(Config().encryptionKey, AES.MODE_CBC, self.iv))
     return cipher