if args.data_folder is None:
    extracted_pkl = args.extracted_pkl
    tensor_folder = args.tensor_folder
else:
    extracted_pkl = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl'])
    params_path = join_path(args.data_folder, 'params.pkl')

# Record the split settings alongside the data folder.
try:
    params = pickle_load(params_path)
except Exception:
    params = {}
params['encode_data_test_ratio'] = args.test_ratio
params['encode_data_random_seed'] = args.random_seed
pickle_save(params, params_path)

# Deterministic train/test split using the configured seed.
dataset = pickle_load(extracted_pkl)
random.Random(args.random_seed).shuffle(dataset)
test_size = int(len(dataset) * args.test_ratio)
test_set = dataset[:test_size]
train_set = dataset[test_size:]

train_path = join_path(args.data_folder, 'train_data')
check_mkdir(train_path)
print(f'Encoding training data: {len(train_set)}')
train_dict_word2idx = encode_knn_dataset(train_set, train_path)

test_path = join_path(args.data_folder, 'test_data')
check_mkdir(test_path)
savepath = args.savepath
if (checkpoint_save is None) and (combinefile is None):
    raise FileNotFoundError('No file path was provided')
else:
    setup_folder(args.data_folder)
    checkpoint_save = join_path(args.data_folder, 'search_chunks')
    savepath = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl'])
    combinefile = None

# Load a combined output file if one was given, otherwise merge the
# checkpoint chunks on disk.
try:
    output_list = pickle_load(combinefile)
except Exception:
    output_list = combine_checkpoint_file(checkpoint_save)

app_list = extract_app(output_list)
print(f'Total number of applications: {len(app_list)}')

processed_data = []
for app in tqdm(app_list):
    intro_text = process_intro(app[0])
    claims = process_claim(app[1])
    definitions = process_definition(app[2])
    app_data = [intro_text, claims, definitions]
    # Keep only applications that have both claims and definitions.
    if (len(claims) > 1) and (len(definitions) > 1):
        processed_data.append(app_data)

if savepath is not None:
    print(f'Saving to {savepath}')
    pickle_save(processed_data, savepath)
parser = argparse.ArgumentParser()
parser.add_argument("--extracted_pkl", default=None)
parser.add_argument("--savepath", default=None)
parser.add_argument("--data_folder", default=None)
args = parser.parse_args()

if args.data_folder is None:
    if args.extracted_pkl is not None:
        extracted_pkl = args.extracted_pkl
    else:
        raise FileNotFoundError('No file path was provided')
    savepath = args.savepath
else:
    setup_folder(args.data_folder)
    extracted_pkl = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl'])
    savepath = join_path(args.data_folder, ['definition', 'def_dict.pkl'])

dataset = pickle_load(extracted_pkl)
print(f'Total number of applications: {len(dataset)}')
def_example = extract_definition(dataset)
print(f'Number of unique terms: {len(def_example)}')
print(f'Saving to {savepath}')
pickle_save(def_example, savepath)
    else:
        raise FileNotFoundError('No file path was provided')
    savepath = args.savepath
else:
    savepath = join_path(args.data_folder, ['vocab', 'vocab_tensor.pkl'])
    chunk_folder = join_path(args.data_folder, 'search_chunks')

chunk_list = combine_checkpoint_file(chunk_folder)
params_path = join_path(args.data_folder, 'params.pkl')
try:
    params = pickle_load(params_path)
except Exception:
    params = {}
params['extract_vocab_max_length'] = args.max_length
params['extract_vocab_error_word_list'] = args.error_word_list
params['extract_vocab_min_freq'] = args.min_freq
pickle_save(params, params_path)

# Sample up to 100 applications and build word counts from their definitions.
app_list = extract_app(chunk_list)
random.shuffle(app_list)
subset_size = min(len(app_list), 100)
sub_list = app_list[:subset_size]
print(f'Number of sampled applications: {len(sub_list)}')

unique_word = {}
word_count = {}
for app in tqdm(sub_list):
    definition = ' '.join(app[2])
    definition = set(definition.lower().split(' '))
    definition = [w for w in definition if len(w) < args.max_length]
    for word in definition:
from os.path import join

import numpy as np
from tqdm import tqdm

# encode_data, extract_term_from_definition, pickle_save and
# text_to_word_sequence are assumed to be provided by the project's
# utilities / Keras elsewhere in this module.


def encode_knn_dataset(dataset, save_path, term_pattern=r'".+?"'):
    dict_word2idx = {}
    for i, app in tqdm(enumerate(dataset)):
        intro = app[0]
        claims = ' '.join(app[1])
        intro_tensor = encode_data(intro).numpy()
        claims_tensor = encode_data(claims).numpy()
        definitions = app[2]
        for j, def_entry in enumerate(definitions):
            try:
                term = extract_term_from_definition(def_entry)
                if len(term) > 1:
                    term = term[0]
                term_tensor = encode_data(term).numpy()

                # Build (prefix, next word) pairs from the definition text.
                definition_tokens = text_to_word_sequence(def_entry)
                definition_tokens.append('[END]')
                preword_list = []
                target_word_list = []
                for k in range(4, len(definition_tokens)):
                    preword = ' '.join(definition_tokens[:k])
                    target_word = definition_tokens[k]
                    preword_list.append(preword)
                    target_word_list.append(target_word)
                preword_tensors = encode_data(preword_list).numpy()

                # Key = [intro; claims; term; prefix], value = next-word index.
                context = [intro_tensor, claims_tensor, term_tensor]
                context_np = np.concatenate(context, axis=1)
                context_np = np.repeat(context_np, preword_tensors.shape[0], axis=0)
                keys_np = np.concatenate([context_np, preword_tensors], axis=1)

                word_idx_list = []
                for word in target_word_list:
                    if word not in dict_word2idx:
                        dict_word2idx[word] = len(dict_word2idx)
                    word_idx_list.append(dict_word2idx[word])
                vals_np = np.array(word_idx_list).reshape((len(word_idx_list), 1))

                keys_fpath = join(save_path, f'{i}_{j}_keys.npy')
                vals_fpath = join(save_path, f'{i}_{j}_vals.npy')
                np.save(keys_fpath, keys_np)
                np.save(vals_fpath, vals_np)

                del keys_np
                del preword_tensors
                del context_np
                del term_tensor
            except Exception:
                continue
        # Free the per-application tensors once every definition is processed.
        del intro_tensor
        del claims_tensor
    dict_path = join(save_path, 'dict_word2idx.pkl')
    pickle_save(dict_word2idx, dict_path)
    return dict_word2idx
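# --- Illustration only (not part of the original pipeline) ---
# A minimal sketch of how the artifacts written by encode_knn_dataset could be
# read back into a single key/value datastore for nearest-neighbour lookup.
# It assumes only what the function above saves: paired '{i}_{j}_keys.npy' /
# '{i}_{j}_vals.npy' files plus 'dict_word2idx.pkl' in save_path; the helper
# names below (load_knn_datastore, knn_lookup) are hypothetical.
import glob
import pickle
from os.path import join

import numpy as np


def load_knn_datastore(save_path):
    # Stack every saved key/value pair into two aligned arrays.
    keys, vals = [], []
    for keys_fpath in sorted(glob.glob(join(save_path, '*_keys.npy'))):
        vals_fpath = keys_fpath.replace('_keys.npy', '_vals.npy')
        keys.append(np.load(keys_fpath))
        vals.append(np.load(vals_fpath))
    with open(join(save_path, 'dict_word2idx.pkl'), 'rb') as f:
        dict_word2idx = pickle.load(f)
    return np.concatenate(keys, axis=0), np.concatenate(vals, axis=0), dict_word2idx


def knn_lookup(query, keys, vals, k=5):
    # Brute-force Euclidean nearest neighbours; returns the word indices of
    # the k closest stored contexts.
    dists = np.linalg.norm(keys - query, axis=1)
    nearest = np.argsort(dists)[:k]
    return vals[nearest].ravel()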
    tensor_folder = args.tensor_folder
else:
    extracted_pkl = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl'])
    tensor_folder = join_path(args.data_folder, ['tensor', args.mode])
    params_path = join_path(args.data_folder, 'params.pkl')
check_mkdir(tensor_folder)

# Record the encoding settings alongside the data folder.
try:
    params = pickle_load(params_path)
except Exception:
    params = {}
params['encode_data_test_ratio'] = args.test_ratio
params['encode_data_random_seed'] = args.random_seed
params['encode_data_chunk_size'] = args.chunk_size
pickle_save(params, params_path)

# Deterministic train/test split using the configured seed.
dataset = pickle_load(extracted_pkl)
random.Random(args.random_seed).shuffle(dataset)
test_size = int(len(dataset) * args.test_ratio)
test_set = dataset[:test_size]
train_set = dataset[test_size:]

# Split the training set into chunks for encoding.
if args.chunk_size == 0:
    num_chunks = len(train_set)
    train_chunks = chunk_doc(train_set, num_chunks)
elif len(train_set) > args.chunk_size:
    num_chunks = int(len(train_set) / args.chunk_size)
    train_chunks = chunk_doc(train_set, num_chunks)
else:
        L = manager.list()
        processes = []
        for chunk_items in chunk_list:
            p = Process(target=mp_fetch_patent_url, args=(L, chunk_items))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        normal_L = list(L)
        target_doc = [doc for chunk in normal_L for doc in chunk]
else:
    target_doc = fetch_patent_url(date_list)

if params_path is not None:
    pickle_save(params, params_path)

if target_doc is not None:
    print(f'Number of target documents: {len(target_doc)}')
    if args.mp and checkpoint_save:
        chunk_list = chunk_doc(target_doc, num_chunks)
        print(f'Chunked into {len(chunk_list)} chunks')
        with Manager() as manager:
            L = manager.list()
            processes = []
            for chunk_items in chunk_list:
                p = Process(target=main_extraction,
                            args=(chunk_items, L, checkpoint_save,
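# --- Illustration only ---
# A self-contained sketch of the fan-out pattern used above: worker processes
# append their results to a Manager-backed shared list, the parent joins them
# and flattens the result. The worker below is a stand-in, not the project's
# mp_fetch_patent_url / main_extraction workers.
from multiprocessing import Manager, Process


def demo_worker(shared_list, chunk_items):
    # Pretend to process a chunk and publish one result list per chunk.
    shared_list.append([item * 2 for item in chunk_items])


if __name__ == '__main__':
    chunk_list = [[1, 2], [3, 4], [5, 6]]
    with Manager() as manager:
        L = manager.list()
        processes = []
        for chunk_items in chunk_list:
            p = Process(target=demo_worker, args=(L, chunk_items))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        results = [doc for chunk in list(L) for doc in chunk]
    print(results)  # e.g. [2, 4, 6, 8, 10, 12]; chunk order may vary by process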
parser.add_argument("--mode", default='attention')
args = parser.parse_args()

extracted_pkl = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl'])
tensor_folder = join_path(args.data_folder, ['tensor', args.mode])
params_path = join_path(args.data_folder, 'params.pkl')
check_mkdir(tensor_folder)

dataset = pickle_load(extracted_pkl)
for i, app in enumerate(dataset):
    print(f'{i}/{len(dataset)}')
    output = encode_attention_app(app)
    if output is None:
        continue
    for j, definition in enumerate(output[2]):
        save_list = [
            output[0], output[1], definition[0], definition[1], definition[2]
        ]
        train_name = f'{i}_{j}_train.pkl'
        train_path = join_path(tensor_folder, train_name)
        pickle_save(save_list, train_path)
    del output
    gc.collect()
print('Completed')
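# --- Illustration only ---
# A minimal sketch of reading back one of the per-definition pickles written
# above, assuming pickle_save writes standard pickle files. Each file holds a
# five-element list; the variable names and folder below are hypothetical,
# since encode_attention_app is not shown here.
import pickle
from os.path import join

tensor_dir = 'data/tensor/attention'  # hypothetical path mirroring tensor_folder
with open(join(tensor_dir, '0_0_train.pkl'), 'rb') as f:
    save_list = pickle.load(f)

assert len(save_list) == 5
intro_enc, claims_enc, def_part_a, def_part_b, def_part_c = save_list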