def main():
    """Join all input tomes and write the first ``n`` triples after sorting.

    Arguments come from ``utils.argsdirs`` (input directory, output
    directory, group-by members ``m`` and the extra ``n`` option).  The
    joined tome is grouped/summed on ``m``, sorted, truncated with
    ``first(n)`` and written to a new ``most_frequent.gz`` file name
    obtained from ``utils.new_filename``.
    """
    in_d, out_d, m, limit = utils.argsdirs("Most frequent triples", ["n"])
    limit = int(limit)

    sources = []
    for filename in utils.filenames(in_d):
        sources.append(triple.Tome(filename))

    target_name = utils.new_filename(out_d, "most_frequent.gz")
    target = triple.Tome(target_name)

    print("joining the tomes..")
    merged = triple.Tome(sources)

    print("grouping/summing (again)..")
    merged = merged.group_sum(m)

    print("sorting the tomes (again)..")
    merged = merged.sort()

    print("getting the first %d.." % limit)
    merged = merged.first(limit)

    print("writing everything down..")
    emit = target.writer()
    for entry in merged:
        emit(entry)

    print("done.")
def main():
    """Write a sorted copy of each input tome into the output directory.

    For every file name yielded by ``utils.filenames(in_d)`` the tome is
    opened, sorted with ``Tome.sort()`` and its triples are passed one by
    one to the writer of a new tome whose name comes from
    ``utils.new_filename(out_d, filename)``.
    """
    in_d, out_d, _ = utils.argsdirs("Sorting")
    for source_name in utils.filenames(in_d):
        source = triple.Tome(source_name)
        target_name = utils.new_filename(out_d, source_name)
        target = triple.Tome(target_name)
        emit = target.writer()
        ordered = source.sort()
        for entry in ordered:
            emit(entry)
def main():
    """Group and sum the triples of each input tome, one output per input.

    For every file name yielded by ``utils.filenames(in_d)`` the tome is
    opened, reduced with ``group_sum(members_groupby)`` and the resulting
    triples are passed one by one to the writer of a new tome whose name
    comes from ``utils.new_filename(out_d, filename)``.  Progress is
    reported on standard output.
    """
    in_d, out_d, members_groupby = utils.argsdirs("Counting the triples")
    for source_name in utils.filenames(in_d):
        print("processing file %s.." % source_name)
        source = triple.Tome(source_name)

        target_name = utils.new_filename(out_d, source_name)
        print("writing to %s.." % target_name)
        target = triple.Tome(target_name)

        emit = target.writer()
        summed = source.group_sum(members_groupby)
        for entry in summed:
            emit(entry)
def prepare_tomes(in_d):
    """Build and return a ``triple.TomeVoc`` over the tomes found in ``in_d``.

    One ``triple.Tome`` is created per file name yielded by
    ``utils.filenames(in_d)``; the number of tomes is printed before the
    ``TomeVoc`` is constructed from the whole list.
    """
    tomes = []
    for filename in utils.filenames(in_d):
        tomes.append(triple.Tome(filename))
    print("number of tomes found: %d" % len(tomes))

    vocabulary = triple.TomeVoc(tomes)
    # NOTE(review): the value read here is never used afterwards.  The
    # attribute access is kept in case ``indexes`` is a property that does
    # work on first access -- confirm against TomeVoc before removing it.
    _unused_indexes = vocabulary.indexes
    return vocabulary
def main():
    """Pre-process every data file into ``<output-folder>/dataset``.

    Command-line options (both required):
        --data-folder    path of the folder containing the data.
        --output-folder  path of the output folder; results go to its
                         ``dataset`` sub-folder, which is created when it
                         does not exist yet.

    Each file name yielded by ``utils.filenames(data_folder)`` is handed to
    ``preprocess`` together with the output path computed by
    ``utils.new_filename``.
    """
    # The text is a description, so it is passed by keyword: the first
    # positional parameter of ArgumentParser is ``prog`` (the program name
    # shown in the usage line).
    commandline_parser = argparse.ArgumentParser(
        description="Pre-processing of data")
    # Both options are mandatory.  Marking them ``required`` makes argparse
    # print a usage error when one is missing, instead of the script failing
    # later with a TypeError on a ``None`` value.
    commandline_parser.add_argument(
        "--data-folder", required=True,
        help="Specifies the path of the folder containing the data.")
    commandline_parser.add_argument(
        "--output-folder", required=True,
        help="Specifies the path of the output folder.")
    args = commandline_parser.parse_args()

    data_folder = args.data_folder
    output_folder = path.join(args.output_folder, 'dataset')
    if not path.exists(output_folder):
        makedirs(output_folder)

    for file_path in utils.filenames(data_folder):
        output_path = utils.new_filename(output_folder, file_path)
        preprocess(file_path, output_path)