Example #1
    if args.data_folder is None:
        extracted_pkl = args.extracted_pkl
        tensor_folder = args.tensor_folder
    else:
        extracted_pkl = join_path(args.data_folder,
                                  ['extracted_txt', 'extracted.pkl'])
        params_path = join_path(args.data_folder, 'params.pkl')

        try:
            params = pickle_load(params_path)
        except Exception:
            params = {}
        params['encode_data_test_ratio'] = args.test_ratio
        params['encode_data_random_seed'] = args.random_seed
        pickle_save(params, params_path)

    dataset = pickle_load(extracted_pkl)

    # Shuffle with the configured seed so the train/test split is reproducible
    random.Random(args.random_seed).shuffle(dataset)
    test_size = int(len(dataset) * args.test_ratio)
    test_set = dataset[:test_size]
    train_set = dataset[test_size:]

    train_path = join_path(args.data_folder, 'train_data')
    check_mkdir(train_path)
    print(f'Encoding training data: {len(train_set)}')
    train_dict_word2idx = encode_knn_dataset(train_set, train_path)

    test_path = join_path(args.data_folder, 'test_data')
    check_mkdir(test_path)
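
# NOTE: join_path and check_mkdir are small filesystem helpers used throughout
# these scripts but never shown. A minimal sketch of what they appear to do,
# inferred from how they are called above; the implementation below is an
# assumption, not the project's actual code.
import os


def join_path(base, parts):
    # Accept either a single path component or a list of components,
    # matching the two call styles used in the snippets.
    if isinstance(parts, (list, tuple)):
        return os.path.join(base, *parts)
    return os.path.join(base, parts)


def check_mkdir(path):
    # Create the folder if it does not already exist.
    os.makedirs(path, exist_ok=True)
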
        savepath = args.savepath
        if (checkpoint_save is None) and (combinefile is None):
            raise FileNotFoundError('No file path was provided')

    else:
        setup_folder(args.data_folder)
        checkpoint_save = join_path(args.data_folder, 'search_chunks')
        savepath = join_path(args.data_folder, ['extracted_txt',
                                                'extracted.pkl'])
        combinefile = None
    try:
        output_list = pickle_load(combinefile)
    except Exception:
        output_list = combine_checkpoint_file(checkpoint_save)

    app_list = extract_app(output_list)
    print(f'Total number of applications: {len(app_list)}')

    processed_data = []
    for app in tqdm(app_list):
        intro_text = process_intro(app[0])
        claims = process_claim(app[1])
        definitions = process_definition(app[2])
        app_data = [intro_text, claims, definitions]
        if (len(claims) > 1) and (len(definitions) > 1):
            processed_data.append(app_data)

    if savepath is not None:
        print(f'Saving to {savepath}')
        pickle_save(processed_data, savepath)
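
# NOTE: combine_checkpoint_file is not defined in these snippets. From the
# calls above, it gathers the pickled chunk files written to the
# 'search_chunks' folder and concatenates them into one list. A minimal sketch
# under that assumption (the file layout is guessed; pickle_load is the same
# helper the scripts already use):
import os


def combine_checkpoint_file(checkpoint_folder):
    # Collect every pickled checkpoint chunk in the folder into a flat list.
    combined = []
    for fname in sorted(os.listdir(checkpoint_folder)):
        if fname.endswith('.pkl'):
            combined.extend(pickle_load(os.path.join(checkpoint_folder, fname)))
    return combined
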
    parser = argparse.ArgumentParser()

    parser.add_argument("--extracted_pkl", default=None)
    parser.add_argument("--savepath", default=None)
    parser.add_argument("--data_folder", default=None)

    args = parser.parse_args()

    if args.data_folder is None:
        if args.extracted_pkl is not None:
            extracted_pkl = args.extracted_pkl
        else:
            raise FileNotFoundError('No file path was provided')
        savepath = args.savepath
    else:
        setup_folder(args.data_folder)
        extracted_pkl = join_path(args.data_folder,
                                  ['extracted_txt', 'extracted.pkl'])
        savepath = join_path(args.data_folder, ['definition', 'def_dict.pkl'])

    dataset = pickle_load(extracted_pkl)

    print(f'Total number of applications: {len(dataset)}')

    def_example = extract_definition(dataset)

    print(f'Number of unique terms: {len(def_example)}')

    print(f'Saving to {savepath}')
    pickle_save(def_example, savepath)
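
# NOTE: pickle_load and pickle_save are used by every script here but never
# shown; they behave like thin wrappers around the standard pickle module.
# A minimal sketch, assuming plain binary pickle files:
import pickle


def pickle_load(path):
    # Load a single pickled object from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)


def pickle_save(obj, path):
    # Write an object to disk as a pickle file.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
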
Example #4
        else:
            raise FileNotFoundError('No file path was provided')
        savepath = args.savepath
    else:
        savepath = join_path(args.data_folder, ['vocab', 'vocab_tensor.pkl'])
        chunk_folder = join_path(args.data_folder, 'search_chunks')
        chunk_list = combine_checkpoint_file(chunk_folder)
        params_path = join_path(args.data_folder, 'params.pkl')
        try:
            params = pickle_load(params_path)
        except Exception:
            params = {}
        params['extract_vocab_max_length'] = args.max_length
        params['extract_vocab_error_word_list'] = args.error_word_list
        params['extract_vocab_min_freq'] = args.min_freq
        pickle_save(params, params_path)

    app_list = extract_app(chunk_list)
    random.shuffle(app_list)
    subset_size = min(len(app_list), 100)
    sub_list = app_list[:subset_size]

    print(f'Total number of applications: {len(sub_list)}')

    unique_word = {}
    word_count = {}
    for app in tqdm(sub_list):
        definition = ' '.join(app[2])
        definition = set(definition.lower().split(' '))
        definition = [w for w in definition if len(w) < args.max_length]
        for word in definition:
Example #5
def encode_knn_dataset(dataset, save_path, term_pattern=r'".+?"'):
    dict_word2idx = {}

    for i, app in tqdm(enumerate(dataset)):
        intro = app[0]
        claims = ' '.join(app[1])
        intro_tensor = encode_data(intro).numpy()
        claims_tensor = encode_data(claims).numpy()

        definitions = app[2]
        for j, def_entry in enumerate(definitions):
            try:
                term = extract_term_from_definition(def_entry)
                if len(term) > 1:
                    term = term[0]
                term_tensor = encode_data(term).numpy()

                definition_tokens = text_to_word_sequence(def_entry)
                definition_tokens.append('[END]')

                preword_list = []
                target_word_list = []
                # Use a separate loop index so the outer enumerate counter i
                # (which names the saved .npy files below) is not clobbered.
                for k in range(4, len(definition_tokens)):
                    preword = ' '.join(definition_tokens[:k])
                    target_word = definition_tokens[k]

                    preword_list.append(preword)
                    target_word_list.append(target_word)

                preword_tensors = encode_data(preword_list).numpy()
                context = [intro_tensor, claims_tensor, term_tensor]
                context_np = np.concatenate(context, axis=1)
                context_np = np.repeat(context_np,
                                       preword_tensors.shape[0],
                                       axis=0)

                keys_np = np.concatenate([context_np, preword_tensors], axis=1)

                word_idx_list = []
                for word in target_word_list:
                    if word not in dict_word2idx:
                        dict_word2idx[word] = len(dict_word2idx)
                    word_idx_list.append(dict_word2idx[word])

                vals_np = np.array(word_idx_list).reshape(
                    (len(word_idx_list), 1))

                keys_fpath = join(save_path, f'{i}_{j}_keys.npy')
                vals_fpath = join(save_path, f'{i}_{j}_vals.npy')

                np.save(keys_fpath, keys_np)
                np.save(vals_fpath, vals_np)

                del keys_np
                del intro_tensor
                del claims_tensor
                del preword_tensors
                del context_np
                del term_tensor
            except Exception:
                continue

    dict_path = join(save_path, 'dict_word2idx.pkl')
    pickle_save(dict_word2idx, dict_path)

    return dict_word2idx
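
# NOTE: extract_term_from_definition is not shown. The term_pattern default
# above (r'".+?"') suggests the defined term is the quoted phrase inside a
# definition sentence; note that encode_knn_dataset never actually forwards
# term_pattern to the helper. A minimal sketch under that assumption:
import re


def extract_term_from_definition(def_entry, term_pattern=r'".+?"'):
    # Return every quoted phrase in the definition, quotes stripped.
    return [match.strip('"') for match in re.findall(term_pattern, def_entry)]
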
        tensor_folder = args.tensor_folder
    else:
        extracted_pkl = join_path(args.data_folder,
                                  ['extracted_txt', 'extracted.pkl'])
        tensor_folder = join_path(args.data_folder, ['tensor', args.mode])
        params_path = join_path(args.data_folder, 'params.pkl')

        check_mkdir(tensor_folder)
        try:
            params = pickle_load(params_path)
        except Exception:
            params = {}
        params['encode_data_test_ratio'] = args.test_ratio
        params['encode_data_random_seed'] = args.random_seed
        params['encode_data_chunk_size'] = args.chunk_size
        pickle_save(params, params_path)

    dataset = pickle_load(extracted_pkl)

    # Shuffle with the configured seed so the train/test split is reproducible
    random.Random(args.random_seed).shuffle(dataset)
    test_size = int(len(dataset) * args.test_ratio)
    test_set = dataset[:test_size]
    train_set = dataset[test_size:]

    if args.chunk_size == 0:
        num_chunks = len(train_set)
        train_chunks = chunk_doc(train_set, num_chunks)
    elif len(train_set) > args.chunk_size:
        num_chunks = int(len(train_set) / args.chunk_size)
        train_chunks = chunk_doc(train_set, num_chunks)
    else:
Example #7
                L = manager.list()
                processes = []
                for chunk_items in chunk_list:
                    p = Process(target=mp_fetch_patent_url,
                                args=(L, chunk_items))
                    p.start()
                    processes.append(p)
                for p in processes:
                    p.join()
                normal_L = list(L)
                target_doc = [doc for chunk in normal_L for doc in chunk]
        else:
            target_doc = fetch_patent_url(date_list)

    if params_path is not None:
        pickle_save(params, params_path)

    if target_doc is not None:
        print(f'Number of target documents: {len(target_doc)}')
        if args.mp and checkpoint_save:
            chunk_list = chunk_doc(target_doc, num_chunks)
            print(f'Chunked into {len(chunk_list)} chunks')

            with Manager() as manager:
                L = manager.list()
                processes = []
                for chunk_items in chunk_list:
                    p = Process(target=main_extraction,
                                args=(chunk_items,
                                      L,
                                      checkpoint_save,
    parser.add_argument("--mode", default='attention')

    args = parser.parse_args()

    extracted_pkl = join_path(args.data_folder,
                              ['extracted_txt', 'extracted.pkl'])
    tensor_folder = join_path(args.data_folder, ['tensor', args.mode])
    params_path = join_path(args.data_folder, 'params.pkl')

    check_mkdir(tensor_folder)

    dataset = pickle_load(extracted_pkl)

    for i, app in enumerate(dataset):
        print(f'{i}/{len(dataset)}')
        output = encode_attention_app(app)
        if output is None:
            continue
        for j, definition in enumerate(output[2]):
            save_list = [
                output[0], output[1], definition[0], definition[1],
                definition[2]
            ]
            train_name = f'{i}_{j}_train.pkl'
            train_path = join_path(tensor_folder, train_name)
            pickle_save(save_list, train_path)
        del output
        gc.collect()

    print('Completed')
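
# NOTE: chunk_doc, used above to split the training set and to hand work out
# to the worker processes, is not defined in these snippets. A minimal sketch,
# assuming it splits a list into num_chunks roughly equal, order-preserving
# slices:
def chunk_doc(items, num_chunks):
    if not items:
        return []
    num_chunks = max(1, num_chunks)
    chunk_size = -(-len(items) // num_chunks)  # ceiling division
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]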