def format_ds(input_file):
    """Reads the file at the path pointed at by input_file and returns
    Docuscope-formatted results from the Ity DocuscopeTagger, in string form."""
    with open(input_file, 'r') as f:
        text_contents = f.read()
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)
    tagger = DocuscopeTagger(return_included_tags=True)
    tags = tagger.tag(tokens)
    # An ugly hack to fix LAT names: keep only the last dot-separated
    # component of the first rule's name.
    for t in tags[1]:
        new_tag = list(t['rules'][0])
        new_tag[0] = new_tag[0].rsplit('.')[-1]
        new_rules = list(t['rules'])
        new_rules.pop(0)
        new_rules.insert(0, new_tag)
        t['rules'] = tuple(new_rules)
    formatter = LATFormatter.LATFormatter()
    return formatter.format(tags=tags, tokens=tokens, s=text_contents,
                            input_file=input_file)
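# A minimal usage sketch for format_ds, assuming RegexTokenizer,
# DocuscopeTagger, and LATFormatter are already imported at module level as
# the code above implies; the sample path is hypothetical.
def _demo_format_ds():
    # Tag one plain-text file and print the LAT-formatted string result.
    print(format_ds("samples/essay1.txt"))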
def ngramCorpus(args):
    corpus_path = args.corpus_path
    if not os.path.exists(corpus_path):
        raise ValueError("Invalid corpus input path: '%s' does not exist on disk." % corpus_path)
    ncount = int(args.ngram_count)
    if ncount > 3 or ncount < 1:
        raise ValueError("Invalid parameter: ngram count must be between 1 and 3.")
    # Instantiate the tokenizer.
    tokenizer = RegexTokenizer()
    tokens = []
    bad_files = []
    # Traverse files and tokenize.
    documentNgramCounts = defaultdict(int)  # counts the number of documents each ngram appears in
    corpusNgramCounts = defaultdict(int)
    per_doc_path = None
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    if args.per_doc:
        per_doc_path = os.path.join(args.output_dir, 'perDocNgrams')
        if not os.path.exists(per_doc_path):
            os.mkdir(per_doc_path)
    c = 0
    for dirpath, subdirs, files in os.walk(corpus_path):
        for file in files:
            if file.endswith('.txt'):
                print(c)  # progress counter
                c += 1
                filepath = os.path.join(dirpath, file)
                try:
                    # Tokenize.
                    tokens = tasks.tokenizeText(filepath, False, None, None, tokenizer)
                    # Process out punctuation.
                    tokens = tasks.ngramProcess(tokens, args.ngram_pun)
                    # Update the corpus dictionaries.
                    docCounts = tasks.ngramUpdate(tokens, documentNgramCounts,
                                                  corpusNgramCounts, ncount, args.ngram_pun)
                    if args.per_doc:
                        docName = os.path.splitext(os.path.basename(filepath))[0]
                        ngramCSV(documentNgramCounts=None, corpusNgramCounts=docCounts,
                                 maxN=ncount, output_dir=per_doc_path, name=docName, doc=True)
                except NotImplementedError:
                    bad_files.append(filepath)
    ngramCSV(documentNgramCounts=documentNgramCounts, corpusNgramCounts=corpusNgramCounts,
             maxN=ncount, output_dir=args.output_dir,
             name=os.path.basename(corpus_path), doc=False)
    print("Completed ngram processing of corpus " + os.path.basename(corpus_path))
    if bad_files:
        print("Unable to ngram the following files: " + str(bad_files))
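# ngramCorpus reads everything off its single args object, so any object with
# these attributes will do; a hedged sketch using argparse.Namespace, with
# hypothetical paths.
def _demo_ngram_corpus():
    import argparse
    args = argparse.Namespace(
        corpus_path="corpora/my_corpus",  # directory tree of .txt files
        ngram_count="2",                  # passed through int(); must be 1-3
        ngram_pun=False,                  # whether to keep punctuation in ngrams
        output_dir="output/ngrams",       # created if it doesn't exist
        per_doc=True,                     # also write per-document ngram CSVs
    )
    ngramCorpus(args)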
def tag_text(text_path, corpus_info, corpus_data_files, tags, formats=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    text_file = codecs.open(text_path, encoding="UTF-8")
    text_contents = text_file.read()
    text_file.close()
    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)
    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tag_args in tags.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(__import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # Optionally use the rules file that was uploaded with the corpus.
        if tag_name == "SimpleRule" and (
            "SimpleRule" in corpus_data_files
            and "saved" in corpus_data_files["SimpleRule"]
            and len(corpus_data_files["SimpleRule"]["saved"]) > 0
        ):
            if "rules_filename" not in tagger_init_args:
                if len(corpus_data_files["SimpleRule"]["saved"]) > 1:
                    raise NotImplementedError("Multiple rules files for SimpleRuleTagger is not yet supported.")
                tagger_init_args.update(
                    rules_filename=corpus_data_files["SimpleRule"]["saved"][0]
                )
            # Otherwise, SimpleRuleTagger will use the default rules file it knows the path to internally.
        elif tag_name == "TopicModel":
            tagger_init_args.update(
                corpus_name=corpus_info["name"]
            )
        # Instantiate this tagger.
        tagger_instance = tagger_module(**tagger_init_args)
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger_instance.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps
    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formats is not None:
        format_outputs = format_text(tag_maps, tokens, output_dict, corpus_info,
                                     formats, write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
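# A sketch of a single-tagger tag_text call; corpus_info only needs the "name"
# key that tag_text reads, and an empty corpus_data_files is enough when no
# SimpleRule files were uploaded. Paths and names are hypothetical.
def _demo_tag_text():
    result = tag_text(
        text_path="corpora/my_corpus/essay1.txt",
        corpus_info={"name": "my_corpus"},
        corpus_data_files={},
        tags={"Docuscope": {"return_included_tags": True}},
        formats=None,            # skip format_text() and HTML output
        write_to_disk=False,
    )
    print(result["num_word_tokens"], result["num_punctuation_tokens"])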
def _tag_text_with_existing_instances(text_path, corpus_info, corpus_data_files, taggers, formatters=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    # Try to decode the file with multiple encodings.
    text_file = None
    text_contents = None
    for encoding in ["UTF-8", "ISO-8859-1", "CP1252"]:
        try:
            text_file = codecs.open(text_path, encoding=encoding)
            text_contents = text_file.read()
            break
        except UnicodeDecodeError:
            pass
        finally:
            if text_file is not None:
                text_file.close()
    if text_contents is None:
        raise NotImplementedError("Could not find a valid encoding for input file %s" % text_path)
    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)
    # Tag with the already-instantiated taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tagger in taggers.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps
    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formatters is not None:
        format_outputs = _format_text_with_existing_instances(tag_maps, tokens, output_dict,
                                                              corpus_info, formatters,
                                                              write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
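# The point of _tag_text_with_existing_instances is to amortize tagger
# construction across many texts: build the taggers once, then reuse them per
# file. A hedged sketch, assuming DocuscopeTagger is importable from
# Ity.Taggers as the dynamic imports above suggest; paths are hypothetical.
def _demo_tag_many():
    from Ity.Taggers import DocuscopeTagger
    taggers = {"Docuscope": DocuscopeTagger(return_included_tags=True)}
    for path in ("corpora/my_corpus/a.txt", "corpora/my_corpus/b.txt"):
        out = _tag_text_with_existing_instances(
            path, {"name": "my_corpus"}, {}, taggers)
        print(out["text_name"], out["num_tokens"])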
def csv_formatter_app(args):
    # Get the input files with the appropriate file extension.
    patterns = None
    if args.file_extension is not None:
        patterns = (r"\." + args.file_extension + "$",)
    # Figure out which tagger we need.
    imported_tagger = getattr(
        __import__("Ity.Taggers", fromlist=[args.tagger_module_name]),
        args.tagger_module_name
    )
    # Make sure the corpus folder at corpus_path exists.
    # If args.corpus_path is an absolute path, os.path.join() will do the right thing.
    corpus_path = os.path.join(corpus_root, args.corpus_path)
    if not os.path.exists(corpus_path):
        raise ValueError("Corpus at path '%s' does not exist." % corpus_path)
    # TopicModelTagger and a few other things may need this.
    corpus_name = os.path.basename(corpus_path)
    # Filter by file names in the corpus.
    if args.filenames is not None and len(args.filenames) > 0:
        for index, filename in enumerate(args.filenames):
            args.filenames[index] = os.path.join(corpus_path, filename)
        input_paths = FilePaths.valid_paths(args.filenames, patterns=patterns,
                                            recursion_levels=3, debug=args.debug)
    else:
        input_paths = FilePaths.valid_paths((corpus_path,), patterns=patterns,
                                            recursion_levels=3, debug=args.debug)
    ################################
    #### Initialize Ity Modules ####
    ################################
    tokenizer = RegexTokenizer()
    # Instantiate *one* tagger. Note that TopicModelTagger needs a model_path given to it.
    # TODO: Support for multiple taggers.
    # TODO: Run the TopicModel generator for a brand new corpus for which we have no metadata.
    # TODO: It seems like TopicModelTagger implies some kind of CorpusTagger with
    # corpus-specific data. It'd be good to make that a real subclass.
    if args.tagger_module_name == "TopicModelTagger":
        tagger = imported_tagger(corpus_name=corpus_name)
    # Use the rules filename for SimpleRuleTagger if we got one. Otherwise,
    # SimpleRuleTagger will use the rules in "default.csv".
    elif args.tagger_module_name == "SimpleRuleTagger" and args.rules_file is not None:
        tagger = imported_tagger(rules_filename=args.rules_file)
    else:
        tagger = imported_tagger()
    formatter = CSVFormatter()
    # Keep calm and DO THINGS
    tags_list = []
    tokens_list = []
    str_list = []
    text_name_list = []
    # Process each text in the corpus.
    for path_index, path in enumerate(input_paths):
        # Get the name of the text. That appears as output in the CSV.
        text_name = os.path.splitext(os.path.basename(path))[0]
        text_name_list.append(text_name)
        start_time = time()
        # Open the file and get its contents.
        the_file = codecs.open(path, encoding="utf-8")
        the_str = the_file.read()
        the_file.close()
        str_list.append(the_str)
        # Tokenize.
        tokens = tokenizer.tokenize(the_str)
        tokens_list.append(tokens)
        # Tag.
        tag_data, tag_maps = tagger.tag(tokens)
        tags_list.append([tag_data, tag_maps])
        end_time = time()
        # Debug output.
        if args.debug:
            message = "\t** Processed '%s' (%u / %u) in %f seconds. **" % (
                os.path.basename(path),
                path_index + 1,
                len(input_paths),
                end_time - start_time
            )
            print(message)
    # Output the CSV.
    csv_str = formatter.batch_format(
        tags_list=tags_list,
        tokens_list=tokens_list,
        corpus_name=corpus_name,
        s_list=str_list,
        text_name_list=text_name_list
    )
    # Write the csv_str out to a file.
    if args.output_filename is None:
        csv_filename = corpus_name + "_" + args.tagger_module_name + ".csv"
    else:
        csv_filename = args.output_filename
    # Do we have a specified output directory in the args object?
    if args.output_dir is not None:
        csv_dir = os.path.abspath(os.path.expanduser(args.output_dir))
    else:
        # Output the CSV alongside this script by default.
        csv_dir = os.path.abspath(os.path.dirname(__file__))
    # Create the output directory if it doesn't exist.
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    # Get the full file path to the output CSV.
    csv_path = os.path.join(csv_dir, csv_filename)
    # Write the CSV to disk.
    try:
        csv_file = codecs.open(csv_path, encoding="utf-8", mode="w")
        csv_file.write(csv_str)
        csv_file.close()
        # Debug output.
        if args.debug:
            message = "** Wrote CSV containing tagged data for corpus '%s' to '%s'. **" % (corpus_name, csv_path)
            print(message)
        return csv_path
    except IOError:
        if args.debug:
            message = "**** Error writing out CSV containing tagged data for corpus '%s' to '%s'. ****" % (corpus_name, csv_path)
            print(message)
        return None
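# A hedged driver sketch for csv_formatter_app, listing every attribute the
# function reads from args; values are illustrative, and corpus_root is the
# module-level constant the function joins corpus_path onto.
def _demo_csv_formatter():
    import argparse
    args = argparse.Namespace(
        corpus_path="my_corpus",            # joined onto corpus_root
        file_extension="txt",               # None accepts any file
        tagger_module_name="DocuscopeTagger",
        rules_file=None,                    # only consulted for SimpleRuleTagger
        filenames=None,                     # or a list of file names within the corpus
        output_filename=None,               # defaults to <corpus>_<tagger>.csv
        output_dir=None,                    # defaults to this script's directory
        debug=True,
    )
    return csv_formatter_app(args)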
def tag_corpus(corpus_info, corpus_data_files, email='',
               tags=ImmutableDict(Docuscope={
                   "return_included_tags": True,
                   "return_excluded_tags": False
               }),
               create_zip_archive=False, ngram_count=0, ngram_pun=False,
               ngram_per_doc=False, chunk_text=False, chunk_length=None,
               chunk_offset=None, blacklist_path=None, blacklist_words='',
               rule_csv=False, doc_rule=False, defect_count=False, name='',
               app_mode=False, current_task=None, logger=None, token_csv=False,
               verbose=True, includeTagViewer=False):
    print('Starting tag_corpus...')
    tag_corpus_start = time()
    timing = []
    # Validate corpus_info.
    if "path" not in corpus_info or "name" not in corpus_info or "data" not in corpus_info:
        raise ValueError("Invalid corpus_info provided.")
    # Validate chunking parameters.
    if chunk_text:
        if chunk_length is None:
            chunk_length = 2000  # default chunk size
        else:
            chunk_length = int(chunk_length)
        if chunk_offset is None:
            chunk_offset = chunk_length  # default offset is the chunk length
        else:
            chunk_offset = int(chunk_offset)
        if chunk_length < chunk_offset:
            raise ValueError("Invalid chunking parameters: chunk_offset must be <= chunk_length.")
    else:
        if chunk_length is not None or chunk_offset is not None:
            raise ValueError("Text chunking must be enabled to set chunk_length or chunk_offset.")
    # Validate ngram parameters.
    if ngram_count is None and ngram_pun:
        raise ValueError("Ngrams must be enabled to set ngram punctuation handling.")
    if ngram_count is None:
        ngram_count = 0
    if int(ngram_count) < 0 or int(ngram_count) > 3:
        raise ValueError("Ngram count must be between 1 and 3 (0 disables ngrams).")
    else:
        ngram_count = int(ngram_count)
    if name != '':
        # Add a hyphen to match the output naming scheme.
        name = name + "-"
    if doc_rule and not rule_csv:
        raise ValueError("Must enable rule counting to enable per-document rule information.")
    if chunk_text and rule_csv:
        raise ValueError('Rule counting and chunking cannot be performed simultaneously.')
    if chunk_text and token_csv:
        raise ValueError('Token CSVs and chunking cannot be performed simultaneously.')
    # Validate blacklist params and retrieve blacklist words.
    blacklist = []
    if blacklist_path is not None:
        if not os.path.exists(blacklist_path) or blacklist_path.endswith('.txt') is False:
            raise ValueError("Blacklist text file '%s' does not exist. Please supply a valid space-separated .txt file." % blacklist_path)
        else:
            try:
                f = open(blacklist_path)
                for line in f:
                    words = line.split()
                    blacklist.extend(words)
            except IOError:
                raise ValueError("Unable to open blacklist file %s. Please supply a valid space-separated .txt file." % blacklist_path)
    # Or, retrieve blacklisted words from the GUI.
    elif blacklist_words is not None:
        blacklist = str(blacklist_words).split()
    # Add an id to the corpus_info dict.
    corpus_info["processing_id"] = "".join([
        corpus_info["name"], "_",
        "-".join(tags.keys()), "_",
        socket.gethostname(), "_",
        str(int(time()))
    ])
    # Validate taggers.
    tagger_instances = {}
    if len(tags) > 1:
        raise ValueError("Tagging texts with multiple taggers isn't supported yet.")
    for tag_name in tags.keys():
        try:
            __import__("Ity.Taggers." + tag_name + "Tagger")
        except ImportError:
            raise ValueError("A Tagger module for '%s' tags does not exist." % tag_name)
    is_docuscope = True
    # Instantiate taggers.
    start = time()
    for tag_name, tag_args in tags.items():
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(__import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # Use a custom rules file if one was uploaded with the corpus.
        if tag_name == "Docuscope" and (
            "SimpleRule" in corpus_data_files
            and "saved" in corpus_data_files["SimpleRule"]
            and len(corpus_data_files["SimpleRule"]["saved"]) > 0
        ):
            is_docuscope = False
            tagger_init_args.update(
                dictionary_path=corpus_data_files["SimpleRule"]["saved"][0],
                return_untagged_tags=True,
                return_unrecognized_tags=True,
                blacklist=blacklist
            )
        else:
            tagger_init_args.update(
                return_untagged_tags=True,
                return_unrecognized_tags=True,
                return_excluded_tags=False,  # prevents display/tagging of whitespace
                return_included_tags=True,
                blacklist=blacklist,
            )
        # Instantiate this tagger.
        # Optimization: detailed tag data isn't required UNLESS we generate
        # tag-level rule statistics.
        if rule_csv or token_csv:
            tagger_init_args.update(return_tag_maps=True)
        tagger_instance = tagger_module(**tagger_init_args)
        tagger_instances[tag_name] = tagger_instance
    timing.append(('Instantiate Taggers', time() - start))
    # Get all the texts in this corpus...if there are any.
    if "Text" not in corpus_info["data"] or len(corpus_data_files["Text"]["saved"]) == 0:
        raise RuntimeError("No corpus texts to tag!")
    text_paths = corpus_data_files["Text"]["saved"]
    if app_mode:
        # Logging.
        logger.info("Email: %s; Corpus: %s; # Texts: %u." %
                    (email, corpus_info["name"], len(corpus_data_files["Text"]["saved"])))
        # Update progress.
        if current_task.request.id is not None:
            current_task.update_state(state='PROGRESS', meta={
                'current': 0.0,
                'total': 100.0,
                "model_path": corpus_info["name"]
            })
    # Initialize the primary CSV.
    csv_path = getCSVPath(corpus_info, name, 'gen')
    lats = sorted(tagger_instances['Docuscope']._ds_dict.lats)
    header_keys = getHeaderKeys(lats, is_docuscope, defect_count)
    u = open(csv_path, 'w', newline='')
    uwriter = csv.writer(u, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    uwriter.writerow(header_keys)
    # Initialize the rule dict.
    corpus_map = dict()
    if ngram_count > 0:
        documentNgramCounts = defaultdict(int)  # counts the number of documents each ngram appears in
        corpusNgramCounts = defaultdict(int)
    start = time()
    tokenizer = RegexTokenizer()
    timing.append(('Tokenizer Initialization Time: ', time() - start))
    # Tag each text.
    tag_start = time()
    index = 0
    tokenization_time = 0
    tagging_time = 0
    # Initialize the unreadable-files list.
    bad_texts = []
    # gleicher, summer 2021 - make this an enumerate to keep track of how much
    # we've done (just for printing).
    for ct, text_path in enumerate(text_paths):
        if verbose:
            print("file {} of {}".format(ct, len(text_paths)))
        # Tokenize.
        start = time()
        try:
            tokens = tokenizeText(text_path, defect_count, chunk_length, chunk_offset, tokenizer)
        # Skip texts that can't be tokenized.
        except IOError:
            bad_texts.append(text_path)
            continue
        if token_csv:
            token_frame = TokenTransform.transformToFrame(tokens)
        tokenization_time += (time() - start)
        start = time()
        result = tagText(tagger_instance, tag_name, text_path, tokens, corpus_info,
                         chunk=chunk_text, rule_csv=rule_csv, token_csv=token_csv)
        if token_csv:
            tagged_frame = TokenTransform.tagFrameMerge(token_frame, result)
            result["token_csv_name"] = result['text_key'] + '-ubiq-tokens' + '.csv'
            # gleicher - change file names to match the input file names
            # filename = result['text_key']
            filename = os.path.splitext(result['text_name'])[0]
            tokenCSVPath = getCSVPath(corpus_info, name, type='token_csv', docName=filename)
            tagged_frame.to_csv(tokenCSVPath, index=False, header=False, encoding='utf-8')
        else:
            if chunk_text:
                for r in result:
                    r["token_csv_name"] = ""
            else:
                result["token_csv_name"] = ""
        tagging_time += (time() - start)
        # Iterate through tokens (or sub-lists of tokens) and calculate
        # token-level statistics (if necessary), then delete tokens to free up space.
        if chunk_text:
            if defect_count:
                for i in range(len(result)):
                    result[i] = defectProcess(result[i], tokens[0][i])
        else:
            if defect_count:
                result = defectProcess(result, tokens)
        if chunk_text:
            tokens = tokens[1]
        if ngram_count > 0:
            ngram_tokens = ngramProcess(tokens, ngram_pun)
        # Done with tokens; free up memory.
        del tokens
        # Write out the primary CSV.
        if chunk_text:
            for text_dict in result:
                row = result_to_gen_row(text_dict, header_keys)
                uwriter.writerow(row)
        else:
            row = result_to_gen_row(result, header_keys)
            uwriter.writerow(row)
        # Update the corpus dictionaries.
        if rule_csv:
            rule_map = result['rule_map']
            updateCorpusCounts(rule_map, corpus_map)
            if doc_rule:
                # Generate PER-document CSVs all in one method (they all need
                # separate writers anyway).
                perDocRuleCSV(corpus_info, result['text_key'], rule_map)
        del result
        if ngram_count > 0:
            docCounts = ngramUpdate(ngram_tokens, documentNgramCounts,
                                    corpusNgramCounts, ngram_count, ngram_pun)
            if ngram_per_doc:
                docName = os.path.splitext(os.path.basename(text_path))[0]
                ngramCSV(documentNgramCounts=None, corpusNgramCounts=docCounts,
                         maxN=ngram_count, corpus_info=corpus_info,
                         name=name, docName=docName)
        if app_mode:
            if current_task.request.id is not None:
                current_task.update_state(state='PROGRESS', meta={
                    'current': float(index + 1) / len(text_paths) * 100.0,
                    'total': 100.0
                })
        index = index + 1
    u.close()
    timing.append(('Total Tokenization', tokenization_time))
    timing.append(('Total Tagging', tagging_time))
    # Write out the corpus-wide rule CSV (if applicable).
    if rule_csv:
        ruleCSV(corpus_info, name, corpus_map)
    if ngram_count > 0:
        ngramCSV(documentNgramCounts, corpusNgramCounts, ngram_count, corpus_info, name)
    frame = inspect.currentframe()
    tc_args, _, _, tc_values = inspect.getargvalues(frame)
    buildReadme(tc_args, tc_values, timing, version, blacklist, bad_texts)
    if app_mode:
        # Update progress. (Web version.)
        if current_task.request.id is not None:
            current_task.update_state(state='PROGRESS', meta={
                'current': 100.0,
                'total': 100.0
            })
    if token_csv:
        TVpath = os.path.join(corpus_info["output_path"], corpus_info["provenance"], 'TextViewer.html')
        print(TVpath)
        if includeTagViewer:
            shutil.copyfile('TextViewer.html', TVpath)
    print('tag_corpus finished. Total elapsed time: %.2f seconds.' % (time() - tag_corpus_start))
    return csv_path
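# A minimal sketch of driving tag_corpus directly, with the smallest
# corpus_info / corpus_data_files shapes the validation above accepts. The
# "output_path" and "provenance" keys are only touched on the token_csv path,
# and getCSVPath may expect more; all paths here are hypothetical.
def _demo_tag_corpus():
    corpus_info = {
        "name": "my_corpus",
        "path": "corpora/my_corpus",
        "data": {"Text": {}},   # presence of the "Text" key is checked
        "output_path": "output",
        "provenance": "run-001",
    }
    corpus_data_files = {
        "Text": {"saved": ["corpora/my_corpus/a.txt",
                           "corpora/my_corpus/b.txt"]},
    }
    # The default tags argument runs the Docuscope tagger with included tags returned.
    return tag_corpus(corpus_info, corpus_data_files, name="demo")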