def add_collection(self):
    """Build a throwaway test Collection with two documents, persist it, and return it.

    The collection address and both document hashes are random UUIDs; the
    collection is inserted via the cache, its hash recomputed, and the
    session committed.
    """
    cache = self.our_cache
    coll_address = str(uuid.uuid1())
    doc_hash_1 = str(uuid.uuid1())
    doc_hash_2 = str(uuid.uuid1())
    test_docs = [
        Document(
            description="Test document A",
            hash=doc_hash_1,
            title="Test A",
        ),
        Document(
            description="Test document B",
            hash=doc_hash_2,
            title="Test B",
        ),
    ]
    coll = Collection(
        title="Test",
        description="This is a collection!",
        address=coll_address,
        btc="123456789",
        keywords=[],
        documents=test_docs,
        creation_date=datetime.datetime.now(),
        oldest_date=datetime.datetime.now(),
        latest_broadcast_date=datetime.datetime.now()
    )
    cache.insert_new_collection(coll)
    collections.update_hash(coll)
    cache.session.commit()
    return coll
def remove_item_dict2_test(self):
    """Removing one of two added documents must leave only the other registered.

    Uses assertEqual / assertIn / assertNotIn; ``assertEquals`` is a
    deprecated alias removed in Python 3.12.
    """
    controller = DocumentListController()
    document = Document()
    document2 = Document()
    controller.add(document)
    controller.add(document2)
    controller.remove(document)
    self.assertEqual(1, len(controller.association))
    self.assertIn(document2, controller.association.values())
    self.assertNotIn(document, controller.association.values())
def setUp(self):
    """Create the controller under test and a three-document fixture collection.

    The previous version generated four UUID locals (coll_address,
    doc_hash_1..3) that were never used — the fixture uses literal hashes —
    so they have been removed.
    """
    self.controller = Controller()
    self.cache = self.controller.cache
    self.address = 'ffafaf'
    self.test_collection_evil = Collection(
        title="Test multiple33333",
        description="This is a collection! with multiple docs222",
        address=self.address,
        btc="123456789",
        keywords=[
            Keyword(name="Keyword A", id=1199),
            Keyword(name="Keyword c", id=1214),
        ],
        documents=[
            Document(
                description="Test document Z",
                hash="zzzzzzzz",
                title="Test Z",
                accesses=0,
                filename="joe.txt",
                collection_address="BM-2cSrapXpgDTFD8AyDmU1BGifNkB2Z6X9k8"
            ),
            Document(
                description="Test document B",
                hash='gdssgsdg',
                title="Test B",
                accesses=3,
                filename="gile.txt",
                collection_address="BM-2cSrapXpgDTFD8AyDmU1BGifNkB2Z6X9k8"
            ),
            Document(
                description="Test document Bddd",
                hash='afff',
                title="Test B",
                accesses=3,
                filename="gile.txt",
                collection_address="BM-2cSrapXpgDTFD8AyDmU1BGifNkB2Z6X9k8"
            ),
        ],
        creation_date=datetime.datetime.now(),
        oldest_date=datetime.datetime.now(),
        latest_broadcast_date=datetime.datetime.now(),
        latest_btc_tx="btctx1",
        oldest_btc_tx="btctx12",
        accesses=2,
        votes=3,
        votes_last_checked=datetime.datetime.now())
def generate_documents(self, number) -> List[Document]:
    """Generate ``2 * number`` documents with sequential ids starting at 1.

    The first ``number`` documents carry relevance label 0 (NOT_RELEVANT),
    the next ``number`` carry label 1 (RELEVANT). The old comments claimed
    a hard-coded count of 6; the count has always been ``number``.

    :param number: how many documents to create per relevance class
    :return: list of 2 * number Document objects
    """
    # ids 1..number -> label 0 (Relevance.NOT_RELEVANT)
    documents = [Document(doc_id, 0) for doc_id in range(1, number + 1)]
    # ids number+1..2*number -> label 1 (Relevance.RELEVANT)
    documents += [Document(doc_id, 1)
                  for doc_id in range(number + 1, 2 * number + 1)]
    return documents
def _build_docs_keywords(self, payload, collection):
    """
    Populate a collection's keyword and document lists from a received
    FJ Message payload, reusing cached rows where they already exist.

    :param payload: FJ Message payload containing "keywords" and "documents"
    :param collection: the Collection whose lists are appended to
    """
    for key in payload["keywords"]:
        cached_key = self.cache.get_keyword_by_id(key["id"])
        # fall back to a fresh Keyword when the id is not cached yet
        if cached_key is None:
            cached_key = Keyword(name=key["name"])
        collection.keywords.append(cached_key)
    for doc in payload["documents"]:
        cached_doc = self.cache.get_document_by_hash(doc["hash"])
        # fall back to a fresh Document when the hash is not cached yet
        if cached_doc is None:
            cached_doc = Document(
                collection_address=doc["address"],
                description=doc["description"],
                hash=doc["hash"],
                title=doc["title"],
                filename=doc["filename"],
                accesses=doc["accesses"],
            )
        collection.documents.append(cached_doc)
def parse_invoices(entries, account):
    """Create or refresh invoice Documents in the db from raw API entries.

    Missing invoices are created with the account and the 'invoice' type
    tag; existing ones first have any stale state tags stripped. Fields,
    partner link and the current state tag are then (re)applied, plus a
    'due' tag for open invoices past their due date.
    """
    for entry in entries:
        data = entry['invoices']
        invoice = db.query(Document).filter_by(uid=data['id']).first()
        partner = db.query(Partner).filter_by(uid=data['client_id']).first()
        if invoice is None:
            invoice = Document(uid=data['id'], account=account,
                               tags=[typetags['invoice']])
            db.add(invoice)
        else:
            # drop any previously applied state tags before re-tagging
            remaining = list(set(invoice.tags) - set(tags.values()))
            if remaining:
                invoice.tags = remaining
        invoice.name = data['subject']
        invoice.value = data['amount']
        invoice.date = datetime.strptime(data['issued_at'], '%Y-%m-%d')
        invoice.updated_at = datetime.strptime(data['updated_at'],
                                               '%Y-%m-%dT%H:%M:%SZ')
        invoice.meta = json.dumps(data)
        invoice.partner = partner
        invoice.tags.append(tags[data['state']])
        # short-circuit keeps due_at unparsed unless the invoice is open
        if data['state'] == 'open' and \
                datetime.strptime(data['due_at'], '%Y-%m-%d') < datetime.now():
            invoice.tags.append(tags['due'])
def load_docs(self):
    """
    Load all the collection's processed documents into the collection instance.

    A pickle cache is tried first; on a miss, the 10 corpus subdirectories
    are walked, each file becomes a processed Document, and the pickle
    cache is written for the next run.

    Fix: both pickle reads and writes previously leaked file handles via
    ``load(open(...))`` / ``dump(..., open(...))`` — now closed via ``with``.
    """
    pickle_path = f"pickle/{self.name}_docs.p"
    try:
        with open(pickle_path, "rb") as cache_file:
            self.documents = load(cache_file)
        self.number_of_docs = len(self.documents)
    except FileNotFoundError:
        number_document_loaded = 0
        for id_directory in range(10):
            print(f"Loading directory {id_directory}")
            path_directory = self.path_to_corpus + str(id_directory)
            for text_file in listdir(path_directory):
                # create a document instance
                document = Document(
                    id_doc=number_document_loaded,
                    id_folder=id_directory,
                    address=text_file,
                )
                # load data and process documents (filter, remove stopwords
                # and lemmatize)
                document.get_content(self.path_to_corpus)
                document.process_document(stopwords_list=self.stopwords,
                                          lemmatizer=self.lemmatizer)
                self.documents.append(document)
                number_document_loaded += 1
        makedirs(path.dirname(pickle_path), exist_ok=True)
        with open(pickle_path, "wb") as cache_file:
            dump(self.documents, cache_file)
        self.number_of_docs = number_document_loaded
def parse_document(record, account, typestring):
    """Create or refresh a Document row from a raw API record.

    New documents are built from the record, attached to the account, and
    tagged with the given type; existing ones first have stale state tags
    stripped. Fields, partner link and the current state tag are then
    (re)applied, plus a 'due' tag for overdue open documents.
    """
    meta = record['meta']
    document = db.query(Document).filter_by(uid=record['uid']).first()
    partner = get_or_create(Partner, name=meta['client_name'])
    if document is None:
        document = Document(**record)
        document.account = account
        document.tags = [typetags[typestring]]
        db.add(document)
    else:
        # drop any previously applied state tags before re-tagging
        kept_tags = list(set(document.tags) - set(tags.values()))
        if kept_tags:
            document.tags = kept_tags
    document.name = record['name']
    document.value = record['value']
    document.date = record['date']
    document.updated_at = datetime.now()
    document.meta = CustomJSONEncoder().encode(meta)
    document.partner = partner
    document.tags.append(tags[meta['state']])
    if meta['state'] == 'open' and meta['due_at'] < datetime.now():
        document.tags.append(tags['due'])
def convert_to_objects(a_paths, corpus, encoding, train_size):
    """Parse annotation files into Document objects.

    :param a_paths: annotation file paths; only the first ``train_size`` are used
    :param corpus: corpus identifier; must contain 'MADE-1.0' or 'corpus_release'
    :param encoding: text encoding passed to the parsers
    :param train_size: number of paths to convert
    :return: list of Document objects with real plus fictive relations
    :raises ValueError: if the corpus format is not recognised (previously
        this fell through and crashed with a NameError on undefined locals)
    """
    docs = []
    for ann_path in a_paths[:train_size]:
        if 'MADE-1.0' in corpus:
            e_list, r_list = parse_xml(ann_path, encoding)
            text_path = ann_path.replace('annotations', 'corpus') \
                                .replace('.bioc.xml', '')
        elif 'corpus_release' in corpus:
            e_list, r_list = parse_brat(ann_path, encoding)
            text_path = ann_path.replace('ann', 'txt')
        else:
            raise ValueError(f"Unknown corpus format: {corpus!r}")
        fictive_relations = get_fictive_relations(
            e_list, r_list, text_path, encoding)
        docs.append(Document(
            entities=e_list,
            references=r_list + fictive_relations,
            annotation_path=ann_path,
            text_path=text_path,
        ))
    return docs
def add_collection():
    """Create a one-document test collection, cache it, and return it.

    Fixes: the UUID locals were generated but ignored (fresh UUIDs were
    regenerated inline), and the document's ``collection_address`` was set
    to a document-hash UUID rather than the collection's own address.
    """
    global our_cache
    coll_address = str(uuid.uuid1())
    doc_hash_1 = str(uuid.uuid1())
    coll = Collection(
        title="Test",
        description="This is a collection!",
        address=coll_address,
        btc=str(uuid.uuid1()),
        keywords=[],
        documents=[
            Document(
                # point the document at its parent collection's address
                collection_address=coll_address,
                description="Test document A",
                hash=doc_hash_1,
                title="Test A",
            ),
        ],
        creation_date=datetime.datetime.now(),
        oldest_date=datetime.datetime.now(),
        latest_broadcast_date=datetime.datetime.now()
    )
    our_cache.insert_new_collection(coll)
    collections.update_hash(coll)
    our_cache.session.commit()
    return coll
def put_document(file_path, collection_address, title, description):
    """
    Insert a document into the local cache with associated information and
    upload the document to the freenet network.

    :param file_path: the path of the file to upload
    :param collection_address: the collection address associated with the document
    :param title: the title of the document being uploaded
    :param description: the description of the document being uploaded

    Fix: the source file handle was opened without being closed; it is now
    managed with a context manager.
    """
    file_name = os.path.basename(file_path)
    with open(file_path) as source:
        contents = source.read()
    freeCon = FreenetConnection()
    uri = freeCon.put(contents)
    # the freenet URI doubles as the stored file's base name
    _, extension = os.path.splitext(file_name)
    new_file_name = uri + extension
    shutil.copy(file_path,
                os.path.expanduser(config.DOCUMENT_DIRECTORY_PATH) + new_file_name)
    document = Document(
        collection_address=collection_address,
        description=description,
        hash=uri,
        title=title,
        filename=new_file_name,
        accesses=0
    )
    cache.insert_new_document(document)
    collection = cache.get_collection_with_address(collection_address)
    collections.update_hash(collection)
    print("Inserted " + file_path + " successfully with URI " + uri)
    print("Allow up to 10 minutes for file to propagate on the freenet network")
def search(self, query) -> Documents:
    """Run a boosted multi-match query against Elasticsearch.

    Title and text fields are boosted by ELASTIC_TITLE_BOOST and
    ELASTIC_TEXT_BOOST; hits matching the title/category exclusion lists
    or containing REFER_TEXT are skipped. Results are logged as a table.

    :param query: free-text query string
    :return: a Documents container of the retained hits
    """
    title_boost = 'title^' + str(ELASTIC_TITLE_BOOST)
    text_boost = 'text^' + str(ELASTIC_TEXT_BOOST)
    s = Search(using=self.client, index=INDEX_NAME) \
        .query("multi_match", query=query, fields=[title_boost, text_boost])
    response = s.execute()
    table = PrettyTable(['Index', 'Title', 'Score', 'Popularity'])
    docs = Documents()
    skip_count = 0
    for idx, doc in enumerate(response):
        if not any(excl in doc.title for excl in TITLE_EXCLUDES) and \
                not any(excl in doc.category for excl in CAT_EXCL) and \
                REFER_TEXT not in doc.text:
            # utf-8 round-trip — presumably to force-validate the field
            # encodings; a no-op for well-formed strings (TODO confirm)
            doc.title = str.encode(doc.title, encoding='utf-8').decode(encoding='utf-8')
            doc.text = str.encode(doc.text, encoding='utf-8').decode(encoding='utf-8')
            docs.add(Document(doc.title, doc.text, doc.meta.score, idx))
            # f-string replaces the former named lambda (PEP 8 E731)
            table.add_row([idx, doc.title, f'{doc.meta.score:.2f}',
                           doc.popularity_score])
        else:
            skip_count += 1
    Logger.info('Elastic result:\n' + str(table))
    Logger.info(str(skip_count) + ' elastic results were skipped')
    return docs
def remove_item_dict_test(self):
    """Removing the only document must leave the association empty.

    Uses assertEqual; ``assertEquals`` is a deprecated alias removed in
    Python 3.12.
    """
    controller = DocumentListController()
    document = Document()
    controller.add(document)
    controller.remove(document)
    self.assertEqual(0, len(controller.association))
def get_item_from_document_test(self):
    """The item returned for a document must map back to that document.

    Uses assertEqual; ``assertEquals`` is a deprecated alias removed in
    Python 3.12.
    """
    controller = DocumentListController()
    document = Document()
    controller.add(document)
    item = controller.get_item_from_document(document)
    self.assertEqual(document, controller.association[item])
def test_open_blank_file(self):
    """Opening an empty file sets the path and leaves the text empty.

    Fixes: the fixture file is created via a context manager, and
    deprecated ``assertEquals`` is replaced with ``assertEqual``.
    """
    document = Document()
    with open("test_file", "w"):
        pass  # create an empty fixture file
    document.open("test_file")
    self.assertEqual("test_file", document.path)
    self.assertEqual("", document.text)
def setUp(self):
    """Create the controller under test and a two-document fixture collection.

    The previous version generated ``coll_address`` and ``doc_hash_3``
    locals that were never used; they have been removed.
    """
    self.controller = Controller()
    self.address = self.controller.connection.create_address(
        'Controller Test address', True)
    doc_hash_1 = str(uuid.uuid1())
    doc_hash_2 = str(uuid.uuid1())
    self.test_collection = Collection(
        title="Test",
        description="This is a collection!",
        address=self.address,
        btc="123456789",
        keywords=[
            Keyword(name="Keyword A"),
            Keyword(name="Keyword B"),
        ],
        documents=[
            Document(
                description="Test document A",
                hash=doc_hash_1,
                title="Test A",
                accesses=0,
                filename="joe.txt",
                collection_address="afgagahhsgh"
            ),
            Document(
                description="Test document B",
                hash=doc_hash_2,
                title="Test B",
                accesses=3,
                filename="gile.txt",
                collection_address="afgagasghhhss"
            ),
        ],
        creation_date=datetime.datetime.now(),
        oldest_date=datetime.datetime.now(),
        latest_broadcast_date=datetime.datetime.now(),
        latest_btc_tx="btctx1",
        oldest_btc_tx="btctx12",
        accesses=2,
        votes=3,
        votes_last_checked=datetime.datetime.now()
    )
    self.test_signature = Signature(pubkey='itsakey', address=self.address)
def test_text_save_file(self):
    """Saving a document writes its text to the file at its path.

    Fixes: the read handle was never closed (now a context manager), and
    deprecated ``assertEquals`` is replaced with ``assertEqual``.
    """
    document = Document()
    document.text = "this is only a test of save file"
    document.path = "test_file"
    document.save()
    with open(document.path, "r") as text_file:
        self.assertEqual(document.text, text_file.read())
def test_save_inexistent_file(self):
    """Saving creates the file on disk and keeps the in-memory text intact.

    Uses assertEqual; ``assertEquals`` is a deprecated alias removed in
    Python 3.12.
    """
    document = Document()
    document.text = "this is only a test of save file"
    document.path = "test_file"
    document.save()
    self.assertTrue(os.path.exists(document.path))
    self.assertEqual("this is only a test of save file", document.text)
def test_open_text_file(self):
    """Opening a file loads both its path and its text content.

    Fixes: the fixture file is written via a context manager, and
    deprecated ``assertEquals`` is replaced with ``assertEqual``.
    """
    document = Document()
    with open("test_file", "w") as text_file:
        text_file.write("this is only a test")
    document.open("test_file")
    self.assertEqual("test_file", document.path)
    self.assertEqual("this is only a test", document.text)
def add_dict_test(self):
    """Adding a document stores it as a value keyed by a QStandardItem.

    Fixes: dict views are not subscriptable in Python 3, so ``values()[0]``
    / ``keys()[0]`` would raise TypeError — wrap in ``list`` first; use
    ``isinstance`` rather than a ``type(...) ==`` comparison; and replace
    deprecated ``assertEquals``.
    """
    controller = DocumentListController()
    document = Document()
    controller.add(document)
    self.assertEqual(document, list(controller.association.values())[0])
    self.assertTrue(
        isinstance(list(controller.association.keys())[0], QtGui.QStandardItem))
def add_item_name_test(self):
    """The list item's label is the basename of the document's path.

    Fixes: dict views are not subscriptable in Python 3, so ``keys()[0]``
    would raise TypeError; deprecated ``assertEquals`` replaced.
    """
    controller = DocumentListController()
    document = Document()
    document.path = "/path/to/test.tf"
    controller.add(document)
    item = next(iter(controller.association.keys()))
    self.assertEqual("test.tf", item.text())
def remove_item_return_test(self):
    """``remove`` returns the item that was associated with the document.

    Fixes: dict views are not subscriptable in Python 3, so
    ``items()[0][0]`` would raise TypeError; deprecated ``assertEquals``
    replaced.
    """
    controller = DocumentListController()
    document = Document()
    controller.add(document)
    document_item = next(iter(controller.association.items()))[0]
    removed_item = controller.remove(document)
    self.assertEqual(document_item, removed_item)
def test__get_revision_by_timestamp(self):
    """A document returns the revision whose timestamp matches exactly."""
    when = datetime(2020, 1, 1, 1, 1, 1)
    revision = Revision(id=1, content='hello', timestamp=when, document_id=5)
    document = Document(id=5, title='blah', revisions=[revision])
    found = document.get_revision_by_timestamp(when)
    assert found == revision
def test_different_root_hash(self):
    """Each document insert must produce a version with a distinct root hash.

    Uses ``assertNotEqual`` instead of ``assertTrue(a != b)`` for a clearer
    failure message.
    """
    d = Document(
        description="Test document A",
        hash="asdfasdfa;sldkfja;sldkfja;dljkfa;ldf",
        collection_address="bm-first",
        title="Test A",
    )
    d2 = Document(
        description="Test document B",
        hash="fdasdfsdfsdfsdfsdfsdfsdfdfsdfsddfdfdf",
        collection_address="bm-first",
        title="Test B",
    )
    self.cache.insert_new_document(d)
    collections.update_hash(self.collection1)
    self.cache.insert_new_document(d2)
    collections.update_hash(self.collection1)
    versions = self.cache.get_versions_for_collection(
        self.collection1.address)
    self.assertNotEqual(versions[0].root_hash, versions[1].root_hash)
def change_filename_test(self):
    """Renaming a document's path updates its item's displayed basename.

    Fixes: dict views are not subscriptable in Python 3, so ``keys()[0]``
    would raise TypeError; deprecated ``assertEquals`` replaced.
    """
    controller = DocumentListController()
    document = Document()
    document.path = "/path/to/othertest.tf"
    controller.add(document)
    item = next(iter(controller.association.keys()))
    controller.change_filename(document, "/new/file/name.tf")
    self.assertEqual("name.tf", item.text())
def test_replace_text_save_file(self):
    """Saving over an existing file replaces its content with the new text.

    Fixes: both file handles are now closed via context managers, and
    deprecated ``assertEquals`` is replaced with ``assertEqual``.
    """
    with open("test_file", "w") as text_file:
        text_file.write("this is only a test file")
    document = Document()
    document.text = "I changed the text"
    document.path = "test_file"
    document.save()
    with open("test_file", "r") as text_file:
        self.assertEqual(document.text, text_file.read())
def load(self):
    """Load a Document from the JSON file named by ``self.file_name``.

    Sets ``self.language`` from the file's metadata and builds the
    feature list, chapters and stop words before constructing the
    Document.
    """
    with open(self.file_name, "r", encoding='utf8') as handle:
        json_doc = json.loads(handle.read())
    self.language = json_doc["metadata"]["language"]
    features = []
    for raw in json_doc["features"]:
        features.append(Feature(
            raw['type'],
            raw['words'],
            raw['context'],
            self.__letters_to_int(raw['letters']),
            raw['transcription'],
        ))
    chapters = self.__load_chapters(json_doc["text"])
    stop_words = json_doc["stop_words"]
    return Document(chapters, self.language, features, stop_words)
def test_two_doc_insert(self):
    """Inserting two documents must create exactly two collection versions.

    The old len-check / print / fail / assertTrue sequence is collapsed
    into a single ``assertEqual`` with an explanatory message.
    """
    d = Document(
        description="Test document A",
        hash="asdfasdfa;sldkfja;sldkfja;dljkfa;ldf",
        collection_address="bm-first",
        title="Test A",
    )
    d2 = Document(
        description="Test document B",
        hash="fdasdfsdfsdfsdfsdfsdfsdfdfsdfsddfdfdf",
        collection_address="bm-first",
        title="Test B",
    )
    self.cache.insert_new_document(d)
    collections.update_hash(self.collection1)
    self.cache.insert_new_document(d2)
    collections.update_hash(self.collection1)
    versions = self.cache.get_versions_for_collection(
        self.collection1.address)
    self.assertEqual(2, len(versions), "No new version was created")
def build_document(self, data_dict: dict) -> None:
    """
    Build a Document from the data passed in.

    Appends a new Revision to the existing document with the same title,
    or creates a fresh Document when none exists; either way the result is
    saved exactly once (the Revision construction and save call were
    previously duplicated in both branches).

    :param data_dict: dict with at least 'title' and 'content' keys
    """
    self._validate_data(data_dict)
    document = self.document_repository.get_by_title(data_dict['title'])
    new_revision = Revision(content=data_dict['content'])
    if document:
        document.revisions.append(new_revision)
    else:
        document = Document(title=data_dict['title'],
                            revisions=[new_revision])
    self.document_repository.save(document)
def test(self):
    """Re-inserting the fixture collection must change its root hash.

    If the fixture rows already exist (the test ran before), the insert
    raises and the test bails out early.

    Fix: the bare ``except:`` also swallowed SystemExit/KeyboardInterrupt;
    narrowed to ``except Exception``.
    """
    global our_cache
    with our_cache.session.no_autoflush:
        try:
            d = Document(
                description=str(uuid.uuid4()),
                hash=str(uuid.uuid4()),
                collection_address=test_coll.address,
                title=str(uuid.uuid4()),
            )
            our_cache.insert_new_collection(test_coll)
            our_cache.insert_new_document_in_collection(d, test_coll)
        except Exception:
            # Test already ran
            return True
        collections.update_hash(test_coll)
        curr_value = test_coll.get_latest_collection_version().root_hash
        self.assertNotEqual(str(curr_value), prev_value)