Example #1
import multiprocessing
from functools import partial

# ir_utils, get_filenames, start_process and process_file are project-local
# helpers defined elsewhere (process_file is shown in Example #2).


def corpus_parser(data_dir, to_index_dir, pool_size):
    # Example arguments:
    #     data_dir = '/ssd2/francisco/robust_corpus/'
    #     to_index_dir = './robust_dir/robust_corpus/'
    #     pool_size = 10
    ir_utils.create_dir(to_index_dir)

    corpus_files = get_filenames(data_dir)
    print(len(corpus_files))
    #     corpus_files = corpus_files[0:10]  # uncomment to test on a small subset

    pool = multiprocessing.Pool(processes=pool_size, initializer=start_process)

    # Bind the output directory so each worker only receives a filename.
    process_file_partial = partial(process_file, to_index_dir)
    pool.map_async(process_file_partial, corpus_files)

    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks
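The helpers get_filenames and start_process are defined elsewhere in this project; a minimal sketch of what they might look like (the bodies below are assumptions, not the original implementations):

import glob
import multiprocessing
import os


def get_filenames(data_dir):
    # Assumed behaviour: recursively collect every regular file under data_dir.
    return [p for p in glob.glob(os.path.join(data_dir, '**', '*'), recursive=True)
            if os.path.isfile(p)]


def start_process():
    # Assumed behaviour: pool initializer that just logs which worker starts.
    print('Starting', multiprocessing.current_process().name)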
Example #2
import argparse
import multiprocessing
import subprocess

# ir_utils, get_filenames, start_process and pubmed_xml_to_json are
# project-local names used further down in this example.


def process_file(to_index_dir, file):
    # Mirror the source layout: <to_index_dir>/<parent_dir>/<filename>.
    filename = file.split('/')[-1]
    outdir = to_index_dir + '/'.join(file.split('/')[-2:-1]) + '/'
    ir_utils.create_dir(outdir)
    file_out = outdir + filename

    # Lines carrying these TREC SGML tags are copied verbatim; the text between
    # an opening and a closing tag is cleaned with ir_utils.remove_sc().
    open_tags = ['<H3>', '<HT>', '<TEXT>', '<HEADLINE>']
    close_tags = ['</H3>', '</HT>', '</TEXT>', '</HEADLINE>']
    try:
        with open(file, 'rt', encoding="ISO-8859-1") as input_f, open(
                file_out, 'wt', encoding="utf-8") as out_f:

            open_tag = False
            for line in input_f:
                if any(tag in line for tag in close_tags):
                    out_f.write(line)
                    open_tag = False
                    continue
                elif any(tag in line for tag in open_tags):
                    open_tag = True
                    out_f.write(line)
                    continue

                if open_tag:
                    line = ir_utils.remove_sc(line) + '\n'
                out_f.write(line)

        # Report once per file, not once per line.
        print('Saved :', file_out)
    except Exception as e:
        print('error processing file :', file, e)
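# ir_utils is a project-local module that is not shown on this page. A plausible
# minimal version of the two helpers used above (both bodies are assumptions):
import os
import re


def create_dir(path):
    # Create the directory (and any parents) if it does not exist yet.
    os.makedirs(path, exist_ok=True)


def remove_sc(text):
    # Strip special characters: keep alphanumerics, replace the rest with spaces,
    # then collapse runs of whitespace.
    return re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z0-9 ]', ' ', text)).strip()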
def corpus_parser(data_dir, to_index_dir, pool_size):
    # Example arguments:
    #     pool_size = 25
    #     data_dir = '/ssd/francisco/pubmed19/'
    #     to_index_dir = './bioasq_dir/bioasq_corpus/' # TODO Fix, pass to multiprocessing!
    ir_utils.create_dir(to_index_dir)

    pubmed_files = get_filenames(data_dir)

    # assign to the multiprocessing pool
    pool = multiprocessing.Pool(processes=pool_size, initializer=start_process)
    pool.map_async(pubmed_xml_to_json, pubmed_files)

    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks
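# pubmed_xml_to_json is referenced above but not defined on this page. A rough
# sketch of what such a worker could do, assuming uncompressed PubMed XML input
# and the hard-coded output directory that the TODO above complains about:
import json
import xml.etree.ElementTree as ET

TO_INDEX_DIR = './bioasq_dir/bioasq_corpus/'  # hard-coded; see the TODO above


def pubmed_xml_to_json(file):
    # Parse one PubMed XML file and write one JSON document per citation.
    out_path = TO_INDEX_DIR + file.split('/')[-1].replace('.xml', '.json')
    tree = ET.parse(file)
    with open(out_path, 'wt', encoding='utf-8') as out_f:
        for citation in tree.iter('MedlineCitation'):
            doc = {
                'pmid': citation.findtext('PMID'),
                'title': citation.findtext('Article/ArticleTitle'),
                'abstract': citation.findtext('Article/Abstract/AbstractText'),
            }
            out_f.write(json.dumps(doc) + '\n')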
    def build(self):
        ir_utils.create_dir(self.index_dir)
        build_index_command = self.ir_toolkit_location + 'buildindex/IndriBuildIndex'
        toolkit_parameters = [
            build_index_command, self.parameter_file_location,
            self.stopwords_file
        ]

        print(toolkit_parameters)

        proc = subprocess.Popen(toolkit_parameters,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                shell=False)
        # stderr is merged into stdout above, so the err value from communicate()
        # is always None; use the return code to decide whether indexing worked.
        (out, _err) = proc.communicate()
        print(out.decode("utf-8"))
        if proc.returncode == 0:
            return 'Ok'
        print('Index error, return code:', proc.returncode)
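# build() only shells out to IndriBuildIndex; the parameter_file_location it
# passes is expected to be a standard Indri XML parameter file. A hypothetical
# minimal one for the robust setup (paths follow the conventions used below),
# written out from Python purely for illustration:
example_param_file = """<parameters>
  <index>./robust_dir/robust_indri_index</index>
  <corpus>
    <path>./robust_dir/robust_corpus/</path>
    <class>trectext</class>
  </corpus>
  <memory>2G</memory>
  <stemmer><name>krovetz</name></stemmer>
</parameters>
"""

with open('./robust_config/robust_index_param_file', 'wt', encoding='utf-8') as f:
    f.write(example_param_file)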
    parser = argparse.ArgumentParser(
        description='Example 1 - sequential and local execution.')
    parser.add_argument('--dataset', type=str, help='dataset name, e.g. robust or bioasq')
    parser.add_argument('--data_split', type=str, help='data split to generate features for')
    parser.add_argument('--fold', type=str, help='cross-validation fold (1-5), "all", or empty')

    args = parser.parse_args()
    #     args = fakeParser()
    ir_toolkit_location = '../../../indri-l2r/'

    dataset = args.dataset
    workdir = './' + dataset + '_dir/'
    to_index_dir = workdir + dataset + '_corpus/'
    index_dir = workdir + dataset + '_indri_index'
    ir_utils.create_dir(workdir)
    confdir = './' + dataset + '_config/'
    parameter_file_location = confdir + dataset + '_index_param_file'
    stopwords_file = confdir + 'stopwords'

    if not args.fold or args.dataset == 'bioasq':
        args.fold = ['']
    elif args.fold == 'all':
        args.fold = ['1', '2', '3', '4', '5']
        #         args.fold = ['1']
    else:
        args.fold = [args.fold]
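    # A typical invocation of this fragment's script might look like this
    # (the script name is an assumption):
    #     python build_index.py --dataset robust --data_split <split> --fold all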

    # Generate all features for robust
    # Later, divide them according to fold and data_split
    if args.dataset == 'robust':