if __name__ == '__main__':
    # Command-line interface: input directory, LIWC dictionary file and
    # the csv file named as the destination for the results.
    parser = argparse.ArgumentParser()
    parser.add_argument('dir_name', help='the name of the directory '
                        'containing the FoLiA XML files processed')
    parser.add_argument('dic', help='the liwc dictionary to be used')
    parser.add_argument('out_file', help='csv file to store the results')
    args = parser.parse_args()

    # Pick the encoding handed to load_liwc: 'latin1' when the dictionary
    # path ends with 'LIWC_Dutch_dictionary.dic', 'utf8' for anything else.
    if args.dic.endswith('LIWC_Dutch_dictionary.dic'):
        encoding = 'latin1'
    else:
        encoding = 'utf8'

    liwc_dict, liwc_categories = load_liwc(args.dic, encoding)

    # Namespace-qualified ({namespace}tag) names of the FoLiA elements.
    act_tag = '{http://ilk.uvt.nl/folia}div'
    event_tag = '{http://ilk.uvt.nl/folia}event'
    sentence_tag = '{http://ilk.uvt.nl/folia}s'
    word_tag = '{http://ilk.uvt.nl/folia}w'
    text_content_tag = '{http://ilk.uvt.nl/folia}t'

    # One column per LIWC category value plus a '#words' column.
    # list(...) keeps the concatenation valid when values() returns a
    # view (Python 3) and is a no-op copy when it returns a list
    # (Python 2).
    result = pd.DataFrame(
        columns=list(liwc_categories.values()) + ['#words'])

    xml_files = glob.glob('{}/*.xml'.format(args.dir_name))
    for i, f in enumerate(xml_files):
        # Progress message: "<file> (<k> of <total>)". The single
        # parenthesised argument prints the same text whether print is
        # a statement (Python 2) or a function (Python 3).
        print('{} ({} of {})'.format(f, i + 1, len(xml_files)))

        # Per-file accumulators, reset at the start of every file.
        num_words = 0
        liwc_count = Counter()
if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('file_in', help='the name of the FoLiA XML file add ' 'LIWC entities to') parser.add_argument('dir_out', help='the name of the directory to save ' 'the output file to') parser.add_argument('dic', help='json file containing liwc dictionary ' '(e.g., <embem_data_dir>/dict/historic_' 'Dutch_LIWC.dic)') args = parser.parse_args() file_name = args.file_in dir_out = args.dir_out liwc_dict, liwc_categories = load_liwc(args.dic, 'utf8') # Load document context = etree.iterparse(file_name, events=('end',), remove_blank_text=True) annotations_tag = '{http://ilk.uvt.nl/folia}annotations' sentence_tag = '{http://ilk.uvt.nl/folia}s' word_tag = '{http://ilk.uvt.nl/folia}w' text_content_tag = '{http://ilk.uvt.nl/folia}t' for event, elem in context: if elem.tag == annotations_tag: # add entity-annotation for liwc annotation_attrs = { 'annotator': 'liwc',