def init(self):
    """Initialize the project: (re)create the project directory with its
    plain-text and cipher-text subdirectories, then persist the config file.

    :return: None
    """
    def init_action():
        # An existing project directory is wiped recursively before recreating.
        if os.path.isdir(self.proj_name):
            shutil.rmtree(self.proj_name)
        os.mkdir(self.proj_name)
        os.mkdir(self.proj_dir_path + 'plain_text')
        os.mkdir(self.proj_dir_path + 'cipher_text')
        # Persist the project configuration alongside the new directories.
        with open(self.proj_dir_path + 'config', 'wb') as f:
            pickle.dump([self.k, self.l, self.s, self.file_cnt], f)

    # Fresh project: no directory with this name exists yet.
    if not os.path.isdir(self.proj_name):
        printer.print_info("正在初始化项目中...")
        init_action()
        printer.print_success("初始化项目完成!")
        return

    # A directory with the project name already exists — ask before wiping it.
    printer.print_warning("发现已经存在同名目录,是否需要清除该目录下所有内容? (Y/N)")
    answer = input()
    if answer in ('Y', 'y'):
        printer.print_info("正在清空并初始化中...")
        init_action()
        printer.print_success("清空完成!")
    else:
        printer.print_info("用户已拒绝操作,程序退出...")
def upload(self):
    """Upload the ciphertext set, encrypted index and config file to the server.

    On success the local ciphertext directory and encrypted index are removed.

    :return: None
    """
    def upload_action():
        return upload_manager.upload_to_server(self.proj_name, 'Y')

    def delete_local_cipher():
        """Remove the local ciphertext set and encrypted index once the
        upload has completed."""
        shutil.rmtree(self.proj_dir_path + 'cipher_text')
        os.remove(self.proj_dir_path + 'index.enc')

    # Refuse to upload while the project is in an intermediate state.
    status = get_status_by_bits(self.status_bits)
    if 2 <= status < 6:
        printer.print_error('操作失败,理由: ')
        self.status()
        return

    result = upload_action()
    if result == 'success':
        printer.print_success('上传成功!')
        delete_local_cipher()
    else:
        printer.print_error('上传失败!服务器返回信息如下:')
        printer.print_error(result)
def run_selector_parallel_min(partitions, n_partition):
    """Run the positive-example selector on one partition of the train set."""
    selector = ParallelSelectorMin(
        root_path + 'train-set_all.csv',
        root_path + 'usable_authors_to_train_val.npy',
        root_path + f'train_positive/partition_{n_partition}.csv',
        partitions,
        n_partition,
        None,
    )
    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
def _load_data(self, train_data_path):
    """Load the training comments CSV, keeping only the columns used
    downstream; report an error when the path does not exist."""
    if not os.path.exists(train_data_path):
        printer.print_error('Train data path does not exist!')
        return
    wanted_columns = [
        'timestamp', 'author_id', 'comment_id', 'article_id',
        'parent_comment_id'
    ]
    self.train_data_df_top_comments = pd.read_csv(train_data_path)[wanted_columns]
    printer.print_success('Input Data loaded')
def run_selector_parallel_min_validation(partitions, n_partition):
    """Run the positive-example selector on one partition of the validation set."""
    selector = ParallelSelectorMin(
        root_path + 'validation-set_all.csv',
        root_path + 'usable_authors_validation.npy',
        root_path + f'val_positive/partition-{n_partition}.csv',
        partitions,
        n_partition,
    )
    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
def run_selector_negative_paralallel_validation(partitions, n_partition):
    """Run the negative-example selector on one partition of the validation set.

    NOTE(review): "paralallel" is a typo but part of the public name — callers
    depend on it, so it is kept.
    """
    selector = NegativeExampleSelector(
        root_path + 'validation-set_all.csv',
        root_path + 'usable_authors_validation.npy',
        root_path + 'c_articles_dates.csv',
        root_path + f'val_negative/partition-{n_partition}.csv',
        partitions,
        n_partition,
        1,
        False,
        False,
    )
    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
def run_selector_parallel_min_test(partitions, n_partition):
    """Run the positive-example selector on one partition of the test set."""
    selector = ParallelSelectorMin(
        root_path + 'test-set_all.csv',
        root_path + 'usable_authors_test.npy',
        root_path + f'test_positive/partition-{n_partition}.csv',
        partitions,
        n_partition,
        random_sample=False,
    )
    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
def generate_keys_action():
    """Generate the four keys, print them base64-encoded, and save them to a
    local key file.

    NOTE(review): this function reads ``self`` as a free variable, so it is
    presumably defined inside a method — confirm against the enclosing scope.
    """
    k1, k2, k3, k4 = self.gen()
    encoded = [base64.b64encode(key).decode(encoding='UTF-8')
               for key in (k1, k2, k3, k4)]
    print('========THE KEY========')
    print('{}\n{}\n{}\n{}'.format(*encoded))
    print('========THE KEY========')
    # Persist the generated keys locally.
    self.save_keys()
    printer.print_success('密钥文件已保存至本地.')
def run_selector_negative_paralallel_test(partitions, n_partition):
    """Run the negative-example selector on one partition of the test set."""
    # Fixed seed so negative sampling is reproducible across runs.
    np.random.seed(123)
    selector = NegativeExampleSelector(
        root_path + 'test-set_all.csv',
        root_path + 'usable_authors_test.npy',
        root_path + 'c_articles_dates.csv',
        root_path + f'test_negative/partition-{n_partition}.csv',
        partitions,
        n_partition,
        50,
        True,
        False,
    )
    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
def run_selector_negative_paralallel(partitions, n_partition):
    """Run the negative-example selector on one partition of the train set."""
    # Fixed seed so negative sampling is reproducible across runs.
    np.random.seed(123)
    negatives_per_positive = 1
    selector = NegativeExampleSelector(
        root_path + 'train-set_all.csv',
        root_path + 'usable_authors_to_train_val.npy',
        root_path + 'c_articles_dates.csv',
        root_path + f'train_negative/partition-{n_partition}.csv',
        partitions,
        n_partition,
        negatives_per_positive,
    )
    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
def _load_data(self):
    """Load article dates and the input comment set; parse article dates
    into timestamps and keep only the columns the selector uses."""
    self.article_dates_df = pd.read_csv(self.article_dates_path)
    self.article_dates_df['timestamp'] = pd.to_datetime(
        self.article_dates_df['date'])
    if not os.path.exists(self.data_path):
        printer.print_error('Train data path does not exist!')
        return
    columns = [
        'timestamp', 'author_id', 'comment_id', 'article_id',
        'parent_comment_id'
    ]
    self.data_df = pd.read_csv(self.data_path)[columns]
    printer.print_success('Input Data loaded ')
def encrypt_action():
    """Validate the plain-text file naming, optionally auto-reformatting it,
    then encrypt the index and documents and save the encrypted index.

    NOTE(review): reads ``self`` as a free variable, so presumably nested
    inside a method — confirm against the enclosing scope.

    :return: None
    """
    printer.print_info('检查明文目录下文件名格式是否符合要求...')
    if not scanner.check_filename_format(self.proj_dir_path):
        printer.print_info('不符合文件命名格式,请问是否需要执行自动格式化文件名操作? (Y/N)')
        ok = input()
        if ok == 'y' or ok == 'Y':
            scanner.reformat_filename(self.proj_dir_path)
            printer.print_success('格式化文件名成功!')
        else:
            printer.print_error('软件终止...请自行更改文件名以满足要求!')
            # BUG FIX: previously this fell through and encrypted anyway even
            # though the user declined and the message promised termination.
            return
    else:
        printer.print_success('检查完毕,文件名符合要求!')
    printer.print_info('开始加密索引和文档...')
    self.enc()
    self.save_encrypted_index()  # remember to persist the encrypted index
    printer.print_success('加密索引和文档成功')
def tokenize_texts(input_path, output_path):
    """Tokenize the comments at ``input_path`` and write the result to
    ``output_path``."""
    worker = CommentTokenizer(input_path, output_path)
    printer.print_success('START')
    worker.run()
    printer.print_success('FINISHED')
def make_tf_idf(self):
    """Fit a TF-IDF model over ``self.corpus`` and store it on ``self.model``."""
    printer.print_progress('Run TFIDF Model')
    self.model = TfidfModel(self.corpus, normalize=False)
    # BUG FIX: the success message was copy-pasted from make_corpus and
    # wrongly claimed the corpus was created; this step builds the TF-IDF model.
    printer.print_success('Finished to create TFIDF model')
def enc(self):
    """Build the encrypted searchable index — array A and look-up table T —
    and encrypt every document under key K4.

    The "step N" comments follow the numbered steps of the SSE construction
    the code implements (linked-list-per-keyword stored encrypted in A, with
    a look-up table T mapping keyword PRF values to list heads).

    :return: the tuple (A, T) — encrypted array and look-up table.
    """
    def initialization():
        # step1. scan D and generate the set of distinct keywords δ(D)
        self.distinct_word_set = scanner.generate_the_set_of_distinct_keywords_for_docs(self.proj_dir_path)[1]
        # step2. for all w ∈ δ(D), generate D(w)
        self.D_ = scanner.generate_Dw_for_each_keyword(self.proj_dir_path)
        # step3. initialize a global counter ctr = 1 ---> see __init__()

    def building_the_array_A():
        # step4. for 1<=i<=|δ(D)|, build a list Li with nodes Ni,j and
        # store it in array A as follows:
        for i in range(1, len(self.distinct_word_set) + 1):
            # The paper's i is 1-based while the Python list is 0-based,
            # hence the i - 1 here.
            keyword = self.distinct_word_set[i - 1]
            Ki = [None] * (len(self.D_[keyword]) + 1)
            Ni = [None] * (len(self.D_[keyword]) + 1)
            # sample a key Ki,0 <-$- {0, 1}^k
            Ki[0] = Random.new().read(int(self.k / 8))
            self.k0_for_each_keyword[keyword] = Ki[0]
            # for 1<=j<=|D(wi)|-1  (j pre-bound so the post-loop j += 1
            # yields 1 when the loop body never runs, i.e. |D(wi)| == 1)
            j = 0
            for j in range(1, len(self.D_[keyword])):
                # let id(Di,j) be the jth identifier in D(wi)
                id_Dij = self.D_[keyword][j - 1]  # todo
                # generate a key Ki,j <- SKE1.Gen(1^k)
                Ki[j] = Random.new().read(int(self.k / 8))
                # Node layout: <file id || key of next node || address of
                # next node (µ_K1 of ctr + 1)>
                Ni[j] = id_Dij.to_bytes(self.file_cnt_byte, byteorder="big") + Ki[j] + self.mu(self.k1, num2byte(
                    self.ctr + 1, int(self.s / 8)))
                index = self.mu(self.k1, num2byte(self.ctr, int(self.s / 8)))
                if j == 1:
                    # remember the head node's address for this keyword
                    self.addrA[keyword] = index
                index = int.from_bytes(index, byteorder="big")
                self.A[index] = self.SKEEnc(Ki[j - 1], Ni[j])
                if self.entry_size_of_A == -1:
                    # record the entry size once; used to pad A in step5
                    self.entry_size_of_A = len(self.A[index])
                self.ctr += 1
            # for the last node of Li
            # set the address of the next node to NULL:
            # Ni,|D(wi)| = <id(Di,|D(wi)|) || 0^k || NULL>
            j += 1
            id_Dij = self.D_[keyword][len(self.D_[keyword]) - 1]
            Ni[len(self.D_[keyword])] = id_Dij.to_bytes(self.file_cnt_byte, byteorder="big") + b"\x00" * int(
                self.k / 8) + b"\x00" * int(math.ceil(self.s / 8))  # todo
            index = self.mu(self.k1, num2byte(self.ctr, int(self.s / 8)))
            if j == 1:
                # single-document keyword: the last node is also the head
                self.addrA[keyword] = index
            index = int.from_bytes(index, byteorder="big")
            # encrypt the node Ni,|D(wi)| under the key Ki,|D(wi)-1|
            # and store it in A
            self.A[index] = self.SKEEnc(Ki[j - 1], Ni[len(self.D_[keyword])])
            self.ctr += 1
        # step5. set the remaining s - s' entries of A to random values of
        # the same size as the existing s' entries of A
        for i in range(len(self.A)):
            if self.A[i] is None:
                self.A[i] = Random.new().read(self.entry_size_of_A)

    def building_the_look_up_table_T():
        size = -1  # length of a look-up-table entry, needed for step7
        # step6. for all wi ∈ δ(D), set
        # T[π_K3(wi)] = <addr_A(N_i,1 || K_i,0)> ⊕ f_K2(wi)
        for w in self.distinct_word_set:
            index = self.pi(self.k3, str2byte(w))
            index = int.from_bytes(index, byteorder="big")
            self.T[index] = self.xor(self.addrA[w] + self.k0_for_each_keyword[w], self.f(self.k2, str2byte(w)))
            if size == -1:
                size = len(self.T[index])
        # step7. if |δ(D)| < |△|, then set the remaining |△| - |δ(D)|
        # entries of T to random values of the same size as the existing
        # |δ(D)| entries of T
        for i in range(2 ** self.l):
            if self.T[i] is None:
                self.T[i] = Random.new().read(size)

    def enc_docs():
        # step8. for 1 <= i <= n, let ci <- SKE2.Enc_K4(Di)
        # NOTE(review): uses the relative directory 'plain_text' rather than
        # self.proj_dir_path — presumably relies on the current working
        # directory being the project directory; confirm against callers.
        DIR = 'plain_text'
        file_count = len([name for name in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, name))])
        for i in range(file_count):
            self.enc_doc(i, self.k4)

    printer.print_info('创建索引中...')
    initialization()
    printer.print_info('加密索引中...')
    building_the_array_A()
    building_the_look_up_table_T()
    printer.print_info('加密文档中...')
    enc_docs()
    printer.print_success('已就绪.')
    # step9. output
    return self.A, self.T
def collect_texts(raw_comments_path, selection_path, offset_path, output_path):
    """Collect the selected comment texts from the raw comments file and
    write them to ``output_path``."""
    selector = CommentSelector(raw_comments_path, selection_path,
                               offset_path, output_path)
    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
def _save(self):
    """Write the accumulated output rows to ``self.output_path`` as CSV
    (no index column)."""
    frame = pd.DataFrame(self.output, columns=self.output_header)
    frame.to_csv(self.output_path, index=False)
    printer.print_success(f'Saved to {self.output_path}')
def make_corpus(self):
    """Convert each tokenized line into a bag-of-words vector and append
    it to ``self.corpus``, with a progress bar."""
    progress = tqdm(self.data, total=self.number_of_lines)
    for tokens in progress:
        self.corpus.append(self.dictionary.doc2bow(tokens))
    printer.print_success('Finished to create corpus')