data = {} csv_obj = csv.reader(text, delimiter="\t", quotechar='"') rows = [x for x in csv_obj] variables = list( set([ y[0] for x in rows for y in re.findall("%(\w+)(\([\w,]*\))?%", x[0]) ])) data['length'] = len(rows) data['speech_acts'] = list( set([y for x in rows for y in x[1].split(',') if bool(y)])) data['num_speech_acts'] = len(data['speech_acts']) data['variables'] = variables # TODO parse text? return data if __name__ == "__main__": queue = [join("data", "tsv")] input_ext = ".tsv" output_lists = [] output_ext = ".tsv_analysis" utils.standard_main(queue, input_ext, extract_from_file, output_lists, output_ext)
def accum_final(data):
    """Return the accumulated analysis data unchanged.

    Supplied to ``utils.standard_main`` as its ``accumulator_final``
    callback; no post-processing is applied to ``data``.
    """
    return data


if __name__ == "__main__":
    # Starting accumulator state: a set for every '*_components' key and a
    # dict for every '*_counts' key, plus the combined '__all_counts' dict.
    initial_accum = {
        '_cif_state_components': set(),
        '_prom_week_components': set(),
        '_cif_library_components': set(),
        '_cif_state_counts': {},
        '_prom_week_counts': {},
        '_cif_library_counts': {},
        '__all_counts': {},
    }

    # NOTE(review): queue is a single joined path string here, not a list of
    # paths -- confirm utils.standard_main accepts a bare string.
    queue = join("data", "xml", "CiFStates")
    input_ext = ".xml"
    output_ext = ".xml_rule_analysis"
    output_lists = []

    utils.standard_main(
        queue, input_ext, extract_from_file, output_lists, output_ext,
        accumulator=accumulator,
        accumulator_final=accum_final,
        init_accum=initial_accum,
    )