Exemplo n.º 1
0
import os
import pickle
import util

INPUT_FOLDER = '../data/plaintext/'
OUTPUT_FOLDER = '../data/frogged/'

# Ask util for the '.txt' inputs matched against '.frog.out' outputs and keep
# them in sorted order (presumably these are the inputs that still lack an
# output file -- confirm against util.todo_filepaths).
files = sorted(
    util.todo_filepaths(INPUT_FOLDER, '.txt', OUTPUT_FOLDER, '.frog.out'))

# Report the number of entries together with a preview of the first ten.
print(" ".join(["N files TODO:", str(len(files)), str(files[:10])]))

# Store the sorted list as a pickle next to the data folders.
with open('../data/_frog_todo.p', 'wb') as f:
    pickle.dump(files, f)
Exemplo n.º 2
0
# A partial function could not be used here because it cannot be pickled,
# hence this explicit module-level adapter.
def __extract_plaintext_as_tuple(filename_outfolder_tuple):
    """Call extract_plaintext with the two items of a (filename, out_folder) pair.

    Takes a single argument so it can be mapped over an iterable of pairs.
    Returns whatever extract_plaintext returns.
    """
    source_path, target_folder = filename_outfolder_tuple
    result = extract_plaintext(source_path, target_folder)
    return result


def extract_all_plaintext(filenames, out_folder=PLAINTEXT_FOLDER):
    """Extract plaintext from every file in ``filenames`` using a worker pool.

    Each filename is paired with ``out_folder`` and handed to
    ``__extract_plaintext_as_tuple`` through ``pool.imap_unordered``. After
    every finished task the completed fraction is written to stderr as a
    percentage on a single, continuously overwritten line.

    Args:
        filenames: Sized collection of input file paths (``len()`` is used).
        out_folder: Folder passed along with every filename to the extractor.

    Returns:
        None. Results of the individual extractions are discarded.
    """
    num_tasks = len(filenames)
    print("EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format(num_tasks, out_folder))

    # Pair every filename with the output folder: the mapped function takes
    # exactly one argument.
    tuple_input = [(filename, out_folder) for filename in filenames]

    # Use Pool(processes=1) instead when debugging.
    pool = Pool(processes=util.CPU_COUNT)
    try:
        finished = pool.imap_unordered(__extract_plaintext_as_tuple, tuple_input)
        for i, _ in enumerate(finished, 1):
            # float() forces true division. With Python 2 integer operands
            # i / num_tasks floors to 0, so the display stayed at 0% until
            # the very last task completed.
            sys.stderr.write('\rdone {0:%}'.format(float(i) / num_tasks))
    finally:
        # Release the workers even when a task raises, and wait for them to
        # exit so no worker processes are left behind.
        pool.close()
        pool.join()

    print("\nDONE")

if __name__ == '__main__':
    # Script entry point: gather the '.xml' inputs and extract their labels.
    in_folder, out_folder = RAW_DATA_FOLDER, PLAINTEXT_FOLDER

    # '.xml' inputs matched against '.txt' outputs; only consumed by the
    # plaintext extraction step, which is currently disabled.
    todo_filenames = util.todo_filepaths(
        in_folder, '.xml', out_folder, '.txt', blacklist=BLACKLIST)
    #extract_all_plaintext(todo_filenames, out_folder)

    # All '.xml' inputs (no output folder given), minus the blacklist.
    all_filenames = util.todo_filepaths(in_folder, '.xml', blacklist=BLACKLIST)
    labels_path = DATA_FOLDER+'labels.p'
    extract_all_labels(all_filenames, labels_path)
Exemplo n.º 3
0
import os
import pickle
import util

INPUT_FOLDER = '../data/plaintext/'
OUTPUT_FOLDER = '../data/frogged/'

# Ask util for the '.txt' inputs matched against '.frog.out' outputs and keep
# them in sorted order (presumably these are the inputs that still lack an
# output file -- confirm against util.todo_filepaths).
files = sorted(
    util.todo_filepaths(INPUT_FOLDER, '.txt', OUTPUT_FOLDER, '.frog.out'))

# Report the number of entries together with a preview of the first ten.
print(" ".join(["N files TODO:", str(len(files)), str(files[:10])]))

# Store the sorted list as a pickle next to the data folders.
with open('../data/_frog_todo.p', 'wb') as f:
    pickle.dump(files, f)
Exemplo n.º 4
0
    # Announce how many files will be processed and where the output goes.
    print "EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format(
        len(filenames), out_folder)

    # Pair every filename with the output folder, because the mapped function
    # takes a single (filename, out_folder) argument.
    tuple_input = zip(filenames, [out_folder] * len(filenames))

    # Worker pool sized by util.CPU_COUNT (presumably multiprocessing.Pool --
    # the import is not visible here).
    pool = Pool(processes=util.CPU_COUNT)
    #pool = Pool(processes=1)
    num_tasks = len(filenames)
    # Count results as they arrive (starting at 1); the results themselves
    # are discarded.
    for i, _ in enumerate(
            pool.imap_unordered(__extract_plaintext_as_tuple, tuple_input), 1):
        # '\r' moves back to the line start so each update overwrites the last.
        # NOTE(review): under Python 2 without `from __future__ import
        # division`, i / num_tasks is integer division and shows 0% until the
        # final task -- confirm and consider float(i) / num_tasks.
        sys.stderr.write('\rdone {0:%}'.format(i / num_tasks))
    # NOTE(review): the pool is closed but never joined, and is not closed if
    # the loop raises.
    pool.close()

    print "\nDONE"


if __name__ == '__main__':
    # Script entry point: gather the '.xml' inputs and extract their labels.
    in_folder, out_folder = RAW_DATA_FOLDER, PLAINTEXT_FOLDER

    # '.xml' inputs matched against '.txt' outputs; only consumed by the
    # plaintext extraction step, which is currently disabled.
    todo_filenames = util.todo_filepaths(
        in_folder, '.xml', out_folder, '.txt', blacklist=BLACKLIST)
    #extract_all_plaintext(todo_filenames, out_folder)

    # All '.xml' inputs (no output folder given), minus the blacklist.
    all_filenames = util.todo_filepaths(in_folder, '.xml', blacklist=BLACKLIST)
    labels_path = DATA_FOLDER + 'labels.p'
    extract_all_labels(all_filenames, labels_path)