Example #1
def format_ds(input_file):
    """Reads the file at the path pointed at by input_file and returns Docuscope-formatted results from the Ity
    DocuscopeTagger, in string form"""
    with open(input_file, 'r') as f:
        text_contents = f.read()
        tokenizer = RegexTokenizer()
        tokens = tokenizer.tokenize(text_contents)
        tagger = DocuscopeTagger(return_included_tags=True)
        tags = tagger.tag(tokens)
        # Ugly hack to fix LAT names: keep only the last dot-separated component of each tag's first rule name.
        for t in tags[1]:
            new_tag = list(t['rules'][0])
            new_tag[0] = new_tag[0].rsplit('.')[-1]
            new_rules = list(t['rules'])
            new_rules.pop(0)
            new_rules.insert(0, new_tag)
            t['rules'] = tuple(new_rules)
        formatter = LATFormatter.LATFormatter()
        return formatter.format(tags=tags, tokens=tokens, s=text_contents, input_file=input_file)
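
A minimal usage sketch for format_ds, assuming RegexTokenizer, DocuscopeTagger, and the LATFormatter module are importable from the Ity package as in the other examples on this page; the exact import paths and the input file name are assumptions, not part of the original source.

# Hypothetical usage of format_ds; the import paths and "sample.txt" are placeholders.
from Ity.Tokenizers import RegexTokenizer
from Ity.Taggers import DocuscopeTagger
from Ity.Formatters import LATFormatter

if __name__ == "__main__":
    print(format_ds("sample.txt"))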
Example #2
def format_ds(input_file):
    """Reads the file at the path pointed at by input_file and returns Docuscope-formatted results from the Ity
    DocuscopeTagger, in string form"""
    with open(input_file, 'r') as f:
        text_contents = f.read()
        tokenizer = RegexTokenizer()
        tokens = tokenizer.tokenize(text_contents)
        tagger = DocuscopeTagger(return_included_tags=True)
        tags = tagger.tag(tokens)
        # Ugly hack to fix LAT names: keep only the last dot-separated component of each tag's first rule name.
        for t in tags[1]:
            new_tag = list(t['rules'][0])
            new_tag[0] = new_tag[0].rsplit('.')[-1]
            new_rules = list(t['rules'])
            new_rules.pop(0)
            new_rules.insert(0, new_tag)
            t['rules'] = tuple(new_rules)
        formatter = LATFormatter.LATFormatter()
        return formatter.format(tags=tags,
                                tokens=tokens,
                                s=text_contents,
                                input_file=input_file)
Example #3
def tag_text(text_path, corpus_info, corpus_data_files, tags, formats=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    text_file = codecs.open(text_path, encoding="UTF-8")
    text_contents = text_file.read()
    text_file.close()

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tag_args in tags.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(__import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # Optionally use the rules file that was uploaded with the corpus.
        if tag_name == "SimpleRule" and (
            "SimpleRule" in corpus_data_files and
            "saved" in corpus_data_files["SimpleRule"]
            and len(corpus_data_files["SimpleRule"]["saved"]) > 0
        ):
            if "rules_filename" not in tagger_init_args:
                if len(corpus_data_files["SimpleRule"]["saved"]) > 1:
                    raise NotImplementedError("Multiple rules files for SimpleRuleTagger is not yet supported.")
                tagger_init_args.update(
                    rules_filename=corpus_data_files["SimpleRule"]["saved"][0]
                )
            # Otherwise, SimpleRuleTagger will use the default rules file it knows the path to internally.
        elif tag_name == "TopicModel":
            tagger_init_args.update(
                corpus_name=corpus_info["name"]
            )
        # Instantiate this tagger.
        tagger_instance = tagger_module(**tagger_init_args)
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger_instance.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formats is not None:
        format_outputs = format_text(tag_maps, tokens, output_dict, corpus_info, formats, write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
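
A hedged sketch of how tag_text above might be called for a single Docuscope tagging pass; the corpus name, file path, and tagger selection below are illustrative assumptions, not values from the original source.

# Hypothetical call: tags={"Docuscope": None} makes tag_text import and instantiate
# DocuscopeTagger with default arguments; formats=None skips the format_text() step.
result = tag_text(
    text_path="/path/to/corpus/essay1.txt",
    corpus_info={"name": "my_corpus"},
    corpus_data_files={},
    tags={"Docuscope": None},
    formats=None,
    write_to_disk=False
)
print("%s: %d tokens, %d words" % (result["text_name"], result["num_tokens"], result["num_word_tokens"]))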
Example #4
def _tag_text_with_existing_instances(text_path, corpus_info, corpus_data_files, taggers, formatters=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    # Try to decode the file with multiple encodings
    text_file = None
    text_contents = None
    for encoding in ["UTF-8", "ISO-8859-1", "CP1252"]:
        try:
            text_file = codecs.open(text_path, encoding=encoding)
            text_contents = text_file.read()
            break
        except UnicodeDecodeError:
            pass
        finally:
            if text_file is not None:
                text_file.close()
    if text_contents is None:
        raise NotImplementedError("Could not find a valid encoding for input file %s" % text_path) 

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tagger in taggers.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formatters is not None:
        format_outputs = _format_text_with_existing_instances(tag_maps, tokens, output_dict, corpus_info, formatters, write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
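
The variant above expects taggers that were already constructed; a sketch of the intended reuse pattern follows. The DocuscopeTagger import path and the file list are assumptions borrowed from the other examples on this page.

# Build the (expensive) tagger once, then reuse it for every text in the corpus.
from Ity.Taggers import DocuscopeTagger

taggers = {"Docuscope": DocuscopeTagger(return_included_tags=True)}
corpus_info = {"name": "my_corpus"}
for path in ("texts/a.txt", "texts/b.txt"):  # hypothetical file paths
    result = _tag_text_with_existing_instances(
        path, corpus_info, corpus_data_files={}, taggers=taggers
    )
    print("%s: %d tokens" % (result["text_name"], result["num_tokens"]))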
Example #5
def csv_formatter_app(args):
    # Get the input files with the appropriate file extension.
    patterns = None
    if args.file_extension is not None:
        patterns = ("\." + args.file_extension + "$",)

    # Figure out which tagger we need.
    imported_tagger = getattr(__import__("Ity.Taggers", fromlist=[args.tagger_module_name]), args.tagger_module_name)

    # Make sure the corpus folder at corpus_path exists.
    # If args.corpus_path is an absolute path, os.path.join() will do the right thing.
    corpus_path = os.path.join(
        corpus_root,
        args.corpus_path
    )
    if not os.path.exists(corpus_path):
        raise ValueError("Corpus at path '%s' does not exist.")

    # TopicModelTagger and a few other things may need this.
    corpus_name = os.path.basename(corpus_path)

    # Filter by file names in the corpus.
    if args.filenames is not None and len(args.filenames) > 0:
        for index, filename in enumerate(args.filenames):
            args.filenames[index] = os.path.join(corpus_path, filename)
        input_paths = FilePaths.valid_paths(args.filenames, patterns=patterns, recursion_levels=3, debug=args.debug)
    else:
        input_paths = FilePaths.valid_paths((corpus_path,), patterns=patterns, recursion_levels=3, debug=args.debug)

    ################################
    #### Initialize Ity Modules ####
    ################################

    tokenizer = RegexTokenizer()
    # Instantiate *one* tagger. Note that TopicModelTagger needs a model_path given to it.
    # TODO: Support for multiple taggers.
    # TODO: Run the TopicModel generator for a brand new corpus for which we have no metadata.
    # TODO: It seems like TopicModelTagger implies some kind of CorpusTagger with corpus-specific data. It'd be good to make that a real subclass.
    if args.tagger_module_name == "TopicModelTagger":
        tagger = imported_tagger(corpus_name=corpus_name)
    # Use the rules filename for SimpleRuleTagger if we got one. Otherwise, SimpleRuleTagger will use the rules in "default.csv".
    elif args.tagger_module_name == "SimpleRuleTagger" and args.rules_file is not None:
        tagger = imported_tagger(rules_filename=args.rules_file)
    else:
        tagger = imported_tagger()
    formatter = CSVFormatter()

    # Keep calm and DO THINGS
    tags_list = []
    tokens_list = []
    str_list = []
    text_name_list = []

    # Process each text in the corpus.
    for path_index, path in enumerate(input_paths):
        # Get the name of the text. That appears as output in the CSV.
        text_name = os.path.splitext(os.path.basename(path))[0]
        text_name_list.append(text_name)

        start_time = time()

        # Open the file and get its contents.
        the_file = codecs.open(path, encoding="utf-8")
        the_str = the_file.read()
        the_file.close()
        str_list.append(the_str)

        # Tokenize
        tokens = tokenizer.tokenize(the_str)
        tokens_list.append(tokens)

        # Tag
        tag_data, tag_maps = tagger.tag(tokens)
        tags_list.append([tag_data, tag_maps])

        end_time = time()

        # Debug output
        if args.debug:
            message = "\t** Processed '%s' (%u / %u) in %f seconds. **" % (
                os.path.basename(path),
                path_index + 1,
                len(input_paths),
                end_time - start_time
            )
            print(message)

    # Output the CSV.
    csv_str = formatter.batch_format(
        tags_list=tags_list,
        tokens_list=tokens_list,
        corpus_name=corpus_name,
        s_list=str_list,
        text_name_list=text_name_list
    )
    # Write the csv_str out to a file.
    if args.output_filename is None:
        csv_filename = corpus_name + "_" + args.tagger_module_name + ".csv"
    else:
        csv_filename = args.output_filename
    # Do we have a specified output directory in the args object?
    if args.output_dir is not None:
        csv_dir = os.path.abspath(
            os.path.expanduser(args.output_dir)
        )
    else:
        # By default, output the CSV alongside this script (the directory containing __file__).
        csv_dir = os.path.abspath(os.path.dirname(__file__))
    # Create the output directory if it doesn't exist.
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    # Get the full file path to the output CSV.
    csv_path = os.path.join(
        csv_dir,
        csv_filename
    )
    # Write the CSV to disk.
    try:
        csv_file = codecs.open(csv_path, encoding="utf-8", mode="w")
        csv_file.write(csv_str)
        csv_file.close()
        # Debug output
        if args.debug:
            message = "** Wrote CSV containing tagged data for corpus '%s' to '%s'. **" % (corpus_name, csv_path)
            print(message)
        return csv_path
    except IOError:
        if args.debug:
            message = "**** Error writing out CSV containing tagged data for corpus '%s' to '%s'. ****" % (corpus_name, csv_path)
            print(message)
        return None
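
csv_formatter_app reads its options from an argparse-style namespace; a minimal sketch of constructing one by hand is below. The attribute names mirror what the function body accesses, but the concrete values (corpus path, tagger name, and so on) are placeholders.

# Hypothetical invocation; corpus_root must already point at the corpora directory.
from argparse import Namespace

args = Namespace(
    file_extension="txt",
    tagger_module_name="DocuscopeTagger",
    corpus_path="my_corpus",
    filenames=None,
    rules_file=None,
    output_filename=None,
    output_dir=None,
    debug=True,
)
csv_path = csv_formatter_app(args)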
Example #6
def csv_formatter_app(args):
    # Get the input files with the appropriate file extension.
    patterns = None
    if args.file_extension is not None:
        patterns = ("\." + args.file_extension + "$", )

    # Figure out which tagger we need.
    imported_tagger = getattr(
        __import__("Ity.Taggers", fromlist=[args.tagger_module_name]),
        args.tagger_module_name)

    # Make sure the corpus folder at corpus_path exists.
    # If args.corpus_path is an absolute path, os.path.join() will do the right thing.
    corpus_path = os.path.join(corpus_root, args.corpus_path)
    if not os.path.exists(corpus_path):
        raise ValueError("Corpus at path '%s' does not exist.")

    # TopicModelTagger and a few other things may need this.
    corpus_name = os.path.basename(corpus_path)

    # Filter by file names in the corpus.
    if args.filenames is not None and len(args.filenames) > 0:
        for index, filename in enumerate(args.filenames):
            args.filenames[index] = os.path.join(corpus_path, filename)
        input_paths = FilePaths.valid_paths(args.filenames,
                                            patterns=patterns,
                                            recursion_levels=3,
                                            debug=args.debug)
    else:
        input_paths = FilePaths.valid_paths((corpus_path, ),
                                            patterns=patterns,
                                            recursion_levels=3,
                                            debug=args.debug)

    ################################
    #### Initialize Ity Modules ####
    ################################

    tokenizer = RegexTokenizer()
    # Instantiate *one* tagger. Note that TopicModelTagger needs a model_path given to it.
    # TODO: Support for multiple taggers.
    # TODO: Run the TopicModel generator for a brand new corpus for which we have no metadata.
    # TODO: It seems like TopicModelTagger implies some kind of CorpusTagger with corpus-specific data. It'd be good to make that a real subclass.
    if args.tagger_module_name == "TopicModelTagger":
        tagger = imported_tagger(corpus_name=corpus_name)
    # Use the rules filename for SimpleRuleTagger if we got one. Otherwise, SimpleRuleTagger will use the rules in "default.csv".
    elif args.tagger_module_name == "SimpleRuleTagger" and args.rules_file is not None:
        tagger = imported_tagger(rules_filename=args.rules_file)
    else:
        tagger = imported_tagger()
    formatter = CSVFormatter()

    # Keep calm and DO THINGS
    tags_list = []
    tokens_list = []
    str_list = []
    text_name_list = []

    # Process each text in the corpus.
    for path_index, path in enumerate(input_paths):
        # Get the name of the text. That appears as output in the CSV.
        text_name = os.path.splitext(os.path.basename(path))[0]
        text_name_list.append(text_name)

        start_time = time()

        # Open the file and get its contents.
        the_file = codecs.open(path, encoding="utf-8")
        the_str = the_file.read()
        the_file.close()
        str_list.append(the_str)

        # Tokenize
        tokens = tokenizer.tokenize(the_str)
        tokens_list.append(tokens)

        # Tag
        tag_data, tag_maps = tagger.tag(tokens)
        tags_list.append([tag_data, tag_maps])

        end_time = time()

        # Debug output
        if args.debug:
            message = "\t** Processed '%s' (%u / %u) in %f seconds. **" % (
                os.path.basename(path), path_index + 1, len(input_paths),
                end_time - start_time)
            print(message)

    # Output the CSV.
    csv_str = formatter.batch_format(tags_list=tags_list,
                                     tokens_list=tokens_list,
                                     corpus_name=corpus_name,
                                     s_list=str_list,
                                     text_name_list=text_name_list)
    # Write the csv_str out to a file.
    if args.output_filename is None:
        csv_filename = corpus_name + "_" + args.tagger_module_name + ".csv"
    else:
        csv_filename = args.output_filename
    # Do we have a specified output directory in the args object?
    if args.output_dir is not None:
        csv_dir = os.path.abspath(os.path.expanduser(args.output_dir))
    else:
        # By default, output the CSV alongside this script (the directory containing __file__).
        csv_dir = os.path.abspath(os.path.dirname(__file__))
    # Create the output directory if it doesn't exist.
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    # Get the full file path to the output CSV.
    csv_path = os.path.join(csv_dir, csv_filename)
    # Write the CSV to disk.
    try:
        csv_file = codecs.open(csv_path, encoding="utf-8", mode="w")
        csv_file.write(csv_str)
        csv_file.close()
        # Debug output
        if args.debug:
            message = "** Wrote CSV containing tagged data for corpus '%s' to '%s'. **" % (
                corpus_name, csv_path)
            print(message)
        return csv_path
    except IOError:
        if args.debug:
            message = "**** Error writing out CSV containing tagged data for corpus '%s' to '%s'. ****" % (
                corpus_name, csv_path)
            print(message)
        return None
Example #7
def tag_text(text_path,
             corpus_info,
             corpus_data_files,
             tags,
             formats=None,
             write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    text_file = codecs.open(text_path, encoding="UTF-8")
    text_contents = text_file.read()
    text_file.close()

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tag_args in tags.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError(
                "Tagging multiple times with the same tagger is not yet supported."
            )
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(
            __import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # Optionally use the rules file that was uploaded with the corpus.
        if tag_name == "SimpleRule" and (
                "SimpleRule" in corpus_data_files
                and "saved" in corpus_data_files["SimpleRule"]
                and len(corpus_data_files["SimpleRule"]["saved"]) > 0):
            if "rules_filename" not in tagger_init_args:
                if len(corpus_data_files["SimpleRule"]["saved"]) > 1:
                    raise NotImplementedError(
                        "Multiple rules files for SimpleRuleTagger is not yet supported."
                    )
                tagger_init_args.update(
                    rules_filename=corpus_data_files["SimpleRule"]["saved"][0])
            # Otherwise, SimpleRuleTagger will use the default rules file it knows the path to internally.
        elif tag_name == "TopicModel":
            tagger_init_args.update(corpus_name=corpus_info["name"])
        # Instantiate this tagger.
        tagger_instance = tagger_module(**tagger_init_args)
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger_instance.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            in tokenizer.excluded_token_types
        ]))
    if formats is not None:
        format_outputs = format_text(tag_maps,
                                     tokens,
                                     output_dict,
                                     corpus_info,
                                     formats,
                                     write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(
            format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
Example #8
def _tag_text_with_existing_instances(text_path,
                                      corpus_info,
                                      corpus_data_files,
                                      taggers,
                                      formatters=None,
                                      write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    # Try to decode the file with multiple encodings
    text_file = None
    text_contents = None
    for encoding in ["UTF-8", "ISO-8859-1", "CP1252"]:
        try:
            text_file = codecs.open(text_path, encoding=encoding)
            text_contents = text_file.read()
            break
        except UnicodeDecodeError:
            pass
        finally:
            if text_file is not None:
                text_file.close()
    if text_contents is None:
        raise NotImplementedError(
            "Could not find a valid encoding for input file %s" % text_path)

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tagger in taggers.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError(
                "Tagging multiple times with the same tagger is not yet supported."
            )
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            in tokenizer.excluded_token_types
        ]))
    if formatters is not None:
        format_outputs = _format_text_with_existing_instances(
            tag_maps,
            tokens,
            output_dict,
            corpus_info,
            formatters,
            write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(
            format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict