Example No. 1
def format_ds(input_file):
    """Reads the file at the path pointed at by input_file and returns Docuscope-formatted results from the Ity
    DocuscopeTagger, in string form"""
    with open(input_file, 'r') as f:
        text_contents = f.read()
        tokenizer = RegexTokenizer()
        tokens = tokenizer.tokenize(text_contents)
        tagger = DocuscopeTagger(return_included_tags=True)
        tags = tagger.tag(tokens)
        # Hack: trim the primary LAT rule name down to its final dot-separated component
        for t in tags[1]:
            new_tag = list(t['rules'][0])
            new_tag[0] = new_tag[0].rsplit('.')[-1]
            new_rules = list(t['rules'])
            new_rules.pop(0)
            new_rules.insert(0, new_tag)
            t['rules'] = tuple(new_rules)
        formatter = LATFormatter.LATFormatter()
        return formatter.format(tags=tags, tokens=tokens, s=text_contents, input_file=input_file)
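A minimal usage sketch for format_ds, assuming the Ity imports used above (RegexTokenizer, DocuscopeTagger, LATFormatter) are already available and that sample.txt is a placeholder input path:

if __name__ == "__main__":
    # Hypothetical input file; format_ds reads it and tags it with the Docuscope dictionary.
    formatted = format_ds("sample.txt")
    print(formatted)  # Docuscope-formatted results, as a single string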
Example No. 2
def format_ds(input_file):
    """Reads the file at the path pointed at by input_file and returns Docuscope-formatted results from the Ity
    DocuscopeTagger, in string form"""
    with open(input_file, 'r') as f:
        text_contents = f.read()
        tokenizer = RegexTokenizer()
        tokens = tokenizer.tokenize(text_contents)
        tagger = DocuscopeTagger(return_included_tags=True)
        tags = tagger.tag(tokens)
        # Hack: trim the primary LAT rule name down to its final dot-separated component
        for t in tags[1]:
            new_tag = list(t['rules'][0])
            new_tag[0] = new_tag[0].rsplit('.')[-1]
            new_rules = list(t['rules'])
            new_rules.pop(0)
            new_rules.insert(0, new_tag)
            t['rules'] = tuple(new_rules)
        formatter = LATFormatter.LATFormatter()
        return formatter.format(tags=tags,
                                tokens=tokens,
                                s=text_contents,
                                input_file=input_file)
Example No. 3
def ngramCorpus(args):
    corpus_path = args.corpus_path
    if not os.path.exists(corpus_path):
        raise ValueError("Invalid input corpus input path.", corpus_path, "does not exist on disk.")
    ncount = int(args.ngram_count)
    if ncount > 3 or ncount < 1:
        raise ValueError("Invalid parameter: ngram count must be between 1 and 3.")
    #instantiate tokenizer
    tokenizer = RegexTokenizer()
    tokens = []
    bad_files = []
    #traverse files and tokenize
    documentNgramCounts = defaultdict(int) # to count number of documents ngrams appear in
    corpusNgramCounts = defaultdict(int)
    per_doc_path = None
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    if args.per_doc:
        per_doc_path = os.path.join(args.output_dir, 'perDocNgrams')
        if not os.path.exists(per_doc_path):
            os.mkdir(per_doc_path)
    c = 0
    for dirpath, subdirs, files in os.walk(corpus_path):
        for file in files:
            if file.endswith('.txt'):
                print(c)
                c += 1
                filepath = os.path.join(dirpath, file)
                try:
                    #tokenize
                    tokens = tasks.tokenizeText(filepath, False, None, None, tokenizer)
                    #process out punctuation
                    tokens = tasks.ngramProcess(tokens, args.ngram_pun)
                    # update corpus dictionaries
                    docCounts = tasks.ngramUpdate(tokens, documentNgramCounts, corpusNgramCounts, ncount, args.ngram_pun)
                    if args.per_doc:
                        docName = os.path.splitext(os.path.basename(filepath))[0]
                        ngramCSV(documentNgramCounts=None, corpusNgramCounts=docCounts, maxN=ncount, output_dir=per_doc_path, name=docName, doc=True)
                except NotImplementedError:
                    bad_files.append(filepath)
    ngramCSV(documentNgramCounts=documentNgramCounts, corpusNgramCounts=corpusNgramCounts, maxN=ncount, output_dir=args.output_dir, name=os.path.basename(corpus_path), doc=False)
    print("Completed ngram processing of corpus " + os.path.basename(corpus_path))
    if bad_files:
        print("Unable to ngram the following files: " + str(bad_files))
Example No. 4
def tag_text(text_path, corpus_info, corpus_data_files, tags, formats=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    text_file = codecs.open(text_path, encoding="UTF-8")
    text_contents = text_file.read()
    text_file.close()

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tag_args in tags.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(__import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # Optionally use the rules file that was uploaded with the corpus.
        if tag_name == "SimpleRule" and (
            "SimpleRule" in corpus_data_files and
            "saved" in corpus_data_files["SimpleRule"]
            and len(corpus_data_files["SimpleRule"]["saved"]) > 0
        ):
            if "rules_filename" not in tagger_init_args:
                if len(corpus_data_files["SimpleRule"]["saved"]) > 1:
                    raise NotImplementedError("Multiple rules files for SimpleRuleTagger is not yet supported.")
                tagger_init_args.update(
                    rules_filename=corpus_data_files["SimpleRule"]["saved"][0]
                )
            # Otherwise, SimpleRuleTagger will use the default rules file it knows the path to internally.
        elif tag_name == "TopicModel":
            tagger_init_args.update(
                corpus_name=corpus_info["name"]
            )
        # Instantiate this tagger.
        tagger_instance = tagger_module(**tagger_init_args)
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger_instance.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formats is not None:
        format_outputs = format_text(tag_maps, tokens, output_dict, corpus_info, formats, write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
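A usage sketch for tag_text; corpus_info and corpus_data_files are placeholder values shaped the way the function expects, and the Docuscope tag arguments follow the defaults used by tag_corpus further down:

corpus_info = {"name": "my_corpus"}   # placeholder corpus metadata
corpus_data_files = {}                # no uploaded SimpleRule rules file
result = tag_text(
    "texts/sample.txt",               # hypothetical text path
    corpus_info,
    corpus_data_files,
    tags={"Docuscope": {"return_included_tags": True}},
)
print(result["text_name"], result["num_tokens"], result["num_word_tokens"])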
Example No. 5
def _tag_text_with_existing_instances(text_path, corpus_info, corpus_data_files, taggers, formatters=None, write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    # Try to decode the file with multiple encodings
    text_file = None
    text_contents = None
    for encoding in ["UTF-8", "ISO-8859-1", "CP1252"]:
        try:
            text_file = codecs.open(text_path, encoding=encoding)
            text_contents = text_file.read()
            break
        except UnicodeDecodeError:
            pass
        finally:
            if text_file is not None:
                text_file.close()
    if text_contents is None:
        raise NotImplementedError("Could not find a valid encoding for input file %s" % text_path) 

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tagger in taggers.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError("Tagging multiple times with the same tagger is not yet supported.")
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens
            if token[RegexTokenizer.INDEXES["TYPE"]] in tokenizer.excluded_token_types
        ])
    )
    if formatters is not None:
        format_outputs = _format_text_with_existing_instances(tag_maps, tokens, output_dict, corpus_info, formatters, write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
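A sketch of the pre-built-instances variant, which exists so the same tagger can be reused across many texts; the DocuscopeTagger constructor call matches the one in the earlier examples, and the paths are placeholders:

taggers = {"Docuscope": DocuscopeTagger(return_included_tags=True)}
corpus_info = {"name": "my_corpus"}   # placeholder corpus metadata
results = [
    _tag_text_with_existing_instances(path, corpus_info, corpus_data_files={}, taggers=taggers)
    for path in ["texts/a.txt", "texts/b.txt"]   # hypothetical text paths
]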
Example No. 6
def csv_formatter_app(args):
    # Get the input files with the appropriate file extension.
    patterns = None
    if args.file_extension is not None:
        patterns = ("\." + args.file_extension + "$",)

    # Figure out which tagger we need.
    imported_tagger = getattr(__import__("Ity.Taggers", fromlist=[args.tagger_module_name]), args.tagger_module_name)

    # Make sure the corpus folder at corpus_path exists.
    # If args.corpus_path is an absolute path, os.path.join() will do the right thing.
    corpus_path = os.path.join(
        corpus_root,
        args.corpus_path
    )
    if not os.path.exists(corpus_path):
        raise ValueError("Corpus at path '%s' does not exist.")

    # TopicModelTagger and a few other things may need this.
    corpus_name = os.path.basename(corpus_path)

    # Filter by file names in the corpus.
    if args.filenames is not None and len(args.filenames) > 0:
        for index, filename in enumerate(args.filenames):
            args.filenames[index] = os.path.join(corpus_path, filename)
        input_paths = FilePaths.valid_paths(args.filenames, patterns=patterns, recursion_levels=3, debug=args.debug)
    else:
        input_paths = FilePaths.valid_paths((corpus_path,), patterns=patterns, recursion_levels=3, debug=args.debug)

    ################################
    #### Initialize Ity Modules ####
    ################################

    tokenizer = RegexTokenizer()
    # Instantiate *one* tagger. Note that TopicModelTagger needs a model_path given to it.
    # TODO: Support for multiple taggers.
    # TODO: Run the TopicModel generator for a brand new corpus for which we have no metadata.
    # TODO: It seems like TopicModelTagger implies some kind of CorpusTagger with corpus-specific data. It'd be good to make that a real subclass.
    if args.tagger_module_name == "TopicModelTagger":
        tagger = imported_tagger(corpus_name=corpus_name)
    # Use the rules filename for SimpleRuleTagger if we got one. Otherwise, SimpleRuleTagger will use the rules in "default.csv".
    elif args.tagger_module_name == "SimpleRuleTagger" and args.rules_file is not None:
        tagger = imported_tagger(rules_filename=args.rules_file)
    else:
        tagger = imported_tagger()
    formatter = CSVFormatter()

    # Keep calm and DO THINGS
    tags_list = []
    tokens_list = []
    str_list = []
    text_name_list = []

    # Process each text in the corpus.
    for path_index, path in enumerate(input_paths):
        # Get the name of the text. That appears as output in the CSV.
        text_name = os.path.splitext(os.path.basename(path))[0]
        text_name_list.append(text_name)

        start_time = time()

        # Open the file and get its contents.
        the_file = codecs.open(path, encoding="utf-8")
        the_str = the_file.read()
        the_file.close()
        str_list.append(the_str)

        # Tokenize
        tokens = tokenizer.tokenize(the_str)
        tokens_list.append(tokens)

        # Tag
        tag_data, tag_maps = tagger.tag(tokens)
        tags_list.append([tag_data, tag_maps])

        end_time = time()

        # Debug output
        if args.debug:
            message = "\t** Processed '%s' (%u / %u) in %f seconds. **" % (
                os.path.basename(path),
                path_index + 1,
                len(input_paths),
                end_time - start_time
            )
            print(message)

    # Output the CSV.
    csv_str = formatter.batch_format(
        tags_list=tags_list,
        tokens_list=tokens_list,
        corpus_name=corpus_name,
        s_list=str_list,
        text_name_list=text_name_list
    )
    # Write the csv_str out to a file.
    if args.output_filename is None:
        csv_filename = corpus_name + "_" + args.tagger_module_name + ".csv"
    else:
        csv_filename = args.output_filename
    # Do we have a specified output directory in the args object?
    if args.output_dir is not None:
        csv_dir = os.path.abspath(
            os.path.expanduser(args.output_dir)
        )
    else:
        # Output the CSV in the current working directory by default.
        csv_dir = os.path.abspath(os.path.dirname(__file__))
    # Create the output directory if it doesn't exist.
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    # Get the full file path to the output CSV.
    csv_path = os.path.join(
        csv_dir,
        csv_filename
    )
    # Write the CSV to disk.
    try:
        csv_file = codecs.open(csv_path, encoding="utf-8", mode="w")
        csv_file.write(csv_str)
        csv_file.close()
        # Debug output
        if args.debug:
            message = "** Wrote CSV containing tagged data for corpus '%s' to '%s'. **" % (corpus_name, csv_path)
            print(message)
        return csv_path
    except IOError:
        if args.debug:
            message = "**** Error writing out CSV containing tagged data for corpus '%s' to '%s'. ****" % (corpus_name, csv_path)
            print(message)
        return None
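A hedged sketch of the argparse wiring csv_formatter_app assumes; the attribute names match what the body reads, but the real CLI definition lives elsewhere in the project and may differ:

import argparse

parser = argparse.ArgumentParser(description="Tag a corpus and write the results to a single CSV.")
parser.add_argument("corpus_path")                               # joined onto corpus_root
parser.add_argument("--tagger_module_name", default="DocuscopeTagger")
parser.add_argument("--file_extension", default="txt")
parser.add_argument("--filenames", nargs="*", default=None)      # optional subset of the corpus
parser.add_argument("--rules_file", default=None)                # only used by SimpleRuleTagger
parser.add_argument("--output_filename", default=None)
parser.add_argument("--output_dir", default=None)
parser.add_argument("--debug", action="store_true")
csv_path = csv_formatter_app(parser.parse_args())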
Example No. 7
def tag_corpus(corpus_info,
               corpus_data_files,
               email='',
               tags=ImmutableDict(Docuscope={
                   "return_included_tags": True,
                   "return_excluded_tags": False
               }),
               create_zip_archive=False,
               ngram_count=0,
               ngram_pun=False,
               ngram_per_doc=False,
               chunk_text=False,
               chunk_length=None,
               chunk_offset=None,
               blacklist_path=None,
               blacklist_words='',
               rule_csv=False,
               doc_rule=False,
               defect_count=False,
               name='',
               app_mode=False,
               current_task=None,
               logger=None,
               token_csv=False,
               verbose=True,
               includeTagViewer=False):
    print('Starting tag_corpus...')
    tag_corpus_start = time()
    timing = []

    # Validate corpus_info.
    if "path" not in corpus_info or "name" not in corpus_info or "data" not in corpus_info:
        raise ValueError("Invalid corpus_info provided.")

    # Validate parameters
    if chunk_text:
        if chunk_length is None:
            chunk_length = 2000  #default chunk size
        else:
            chunk_length = int(chunk_length)
        if chunk_offset is None:
            chunk_offset = chunk_length  #default offset is chunk length
        else:
            chunk_offset = int(chunk_offset)
        if chunk_length < chunk_offset:
            raise ValueError(
                "Invalid chunking parameters: chunk_offset must be <= chunk_length."
            )
    else:
        if chunk_length is not None or chunk_offset is not None:
            raise ValueError(
                "Text chunking must be enabled to set chunk_length or chunk_offset."
            )

    if ngram_count is None and ngram_pun:
        raise ValueError(
            "Ngrams must be enabled to set ngram punctuation handling.")

    if ngram_count is None:
        ngram_count = 0

    if int(ngram_count) < 0 or int(ngram_count) > 3:
        raise ValueError("Ngram count must be between 1 and 3.")
    else:
        ngram_count = int(ngram_count)

    if name != '':  #add a hyphen to match output naming scheme
        name = name + "-"

    if doc_rule and not rule_csv:
        raise ValueError(
            "Must enable rule counting to enable per document rule information."
        )

    if chunk_text and rule_csv:
        raise ValueError(
            'Rule counting and chunking cannot be performed simultaneously.')

    if chunk_text and token_csv:
        raise ValueError(
            'Token csvs and chunking cannot be performed simultaneously.')

    # Validate blacklist params and retrieve blacklist words
    blacklist = []
    if blacklist_path is not None:
        if not os.path.exists(blacklist_path) or not blacklist_path.endswith('.txt'):
            raise ValueError(
                "Blacklist text file '%s' does not exist. Please supply a valid space-separated .txt file."
                % blacklist_path)
        else:
            try:
                with open(blacklist_path) as f:
                    for line in f:
                        blacklist.extend(line.split())
            except IOError:
                raise ValueError(
                    "Unable to open blacklist file %s. Please supply a valid space-separated .txt file."
                    % blacklist_path)

    # Or, retrieve blacklisted words from GUI
    elif blacklist_words is not None:
        blacklist = str(blacklist_words).split()
    # Add an id to the corpus_info dict.
    corpus_info["processing_id"] = "".join([
        corpus_info["name"], "_", "-".join(tags.keys()), "_",
        socket.gethostname(), "_",
        str(int(time()))
    ])

    # Validate Taggers.
    tagger_instances = {}
    if len(tags) > 1:
        raise ValueError(
            "Tagging texts with multiple taggers isn't supported yet.")
    for tag_name in tags.keys():
        try:
            __import__("Ity.Taggers." + tag_name + "Tagger")
        except ImportError:
            raise ValueError("A Tagger module for '%s' tags does not exist." %
                             tag_name)
    is_docuscope = True

    # Instantiate Taggers.
    start = time()
    for tag_name, tag_args in tags.items():
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(
            __import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # custom rules file
        if tag_name == "Docuscope" and (
                "SimpleRule" in corpus_data_files
                and "saved" in corpus_data_files["SimpleRule"]
                and len(corpus_data_files["SimpleRule"]["saved"]) > 0):
            is_docuscope = False
            tagger_init_args.update(
                dictionary_path=corpus_data_files["SimpleRule"]["saved"][0],
                return_untagged_tags=True,
                return_unrecognized_tags=True,
                blacklist=blacklist)
        else:
            tagger_init_args.update(
                return_untagged_tags=True,
                return_unrecognized_tags=True,
                return_excluded_tags=False,  # prevents display/tagging of whitespace
                return_included_tags=True,
                blacklist=blacklist,
            )
        # Instantiate this Tagger.
        # optimization: detailed tag data isn't required UNLESS we generate tag-level rule statistics
        if rule_csv or token_csv:
            tagger_init_args.update(return_tag_maps=True)

        tagger_instance = tagger_module(**tagger_init_args)
        tagger_instances[tag_name] = tagger_instance
    timing.append(('Instantiate Taggers', time() - start))

    # Get all the texts in this corpus...if there are any?
    if "Text" not in corpus_info["data"] or len(
            corpus_data_files["Text"]["saved"]) == 0:
        raise ValueError("No corpus texts to tag!")
    text_paths = corpus_data_files["Text"]["saved"]

    if app_mode:
        # Logging
        logger.info("Email: %s; Corpus: %s; # Texts: %u." %
                    (email, corpus_info["name"],
                     len(corpus_data_files["Text"]["saved"])))

        # Update progress.
        if current_task.request.id is not None:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current': 0.0,
                                          'total': 100.0,
                                          "model_path": corpus_info["name"]
                                      })

    # initialize primary csv
    csv_path = getCSVPath(corpus_info, name, 'gen')
    lats = sorted(tagger_instances['Docuscope']._ds_dict.lats)
    header_keys = getHeaderKeys(lats, is_docuscope, defect_count)
    u = open(csv_path, 'w', newline='')
    uwriter = csv.writer(u, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    uwriter.writerow(header_keys)

    #initialize rule dict
    corpus_map = dict()
    if ngram_count > 0:
        documentNgramCounts = defaultdict(int)  # to count number of documents ngrams appear in
        corpusNgramCounts = defaultdict(int)

    start = time()
    tokenizer = RegexTokenizer()
    timing.append(('Tokenizer Initialization Time: ', time() - start))

    # tag each text
    tag_start = time()
    index = 0
    tokenization_time = 0
    tagging_time = 0

    # initialize unreadable files list
    bad_texts = []

    # gleicher, summer 2021 - make this an enumerate to keep track of how much we've done
    # just for printing
    for ct, text_path in enumerate(text_paths):
        if verbose:
            print("file {} of {}".format(ct, len(text_paths)))
        # tokenize
        start = time()
        try:
            tokens = tokenizeText(text_path, defect_count, chunk_length,
                                  chunk_offset, tokenizer)
        # skip texts that can't be tokenized
        except IOError:
            bad_texts.append(text_path)
            continue
        if token_csv:
            token_frame = TokenTransform.transformToFrame(tokens)

        tokenization_time += (time() - start)

        start = time()
        result = tagText(tagger_instance,
                         tag_name,
                         text_path,
                         tokens,
                         corpus_info,
                         chunk=chunk_text,
                         rule_csv=rule_csv,
                         token_csv=token_csv)
        if token_csv:
            tagged_frame = TokenTransform.tagFrameMerge(token_frame, result)
            result["token_csv_name"] = result[
                'text_key'] + '-ubiq-tokens' + '.csv'
            # gleicher - change file names to match the input file names
            # filename = result['text_key']
            filename = os.path.splitext(result['text_name'])[0]
            tokenCSVPath = getCSVPath(corpus_info,
                                      name,
                                      type='token_csv',
                                      docName=filename)
            tagged_frame.to_csv(tokenCSVPath,
                                index=False,
                                header=False,
                                encoding='utf-8')
        else:
            if chunk_text:
                for r in result:
                    r["token_csv_name"] = ""
            else:
                result["token_csv_name"] = ""
        tagging_time += (time() - start)

        # iterate through tokens (or sub-lists of tokens) and calculate token level statistics (if necessary)
        # then delete tokens to free up space
        if chunk_text:
            if defect_count:
                for i in range(len(result)):
                    result[i] = defectProcess(result[i], tokens[0][i])
        else:
            if defect_count:
                result = defectProcess(result, tokens)

        if chunk_text:
            tokens = tokens[1]

        if ngram_count > 0:
            ngram_tokens = ngramProcess(tokens, ngram_pun)

        # done with tokens, free up memory
        del tokens

        # write out primary csv
        if chunk_text:
            for text_dict in result:
                row = result_to_gen_row(
                    text_dict,
                    header_keys,
                )
                uwriter.writerow(row)
        else:
            row = result_to_gen_row(result, header_keys)
            uwriter.writerow(row)

        # update corpus dictionaries
        if rule_csv:
            rule_map = result['rule_map']
            updateCorpusCounts(rule_map, corpus_map)
            if doc_rule:
                perDocRuleCSV(
                    corpus_info, result['text_key'], rule_map
                )  # generate PER document CSVs all in one method (they all need separate writers anyway)
        del result
        if ngram_count > 0:
            docCounts = ngramUpdate(ngram_tokens, documentNgramCounts,
                                    corpusNgramCounts, ngram_count, ngram_pun)
            if ngram_per_doc:
                docName = os.path.splitext(os.path.basename(text_path))[0]
                ngramCSV(documentNgramCounts=None,
                         corpusNgramCounts=docCounts,
                         maxN=ngram_count,
                         corpus_info=corpus_info,
                         name=name,
                         docName=docName)
        if app_mode:
            if current_task.request.id is not None:
                current_task.update_state(state='PROGRESS',
                                          meta={
                                              'current': float(index + 1) / len(text_paths) * 100.0,
                                              'total': 100.0
                                          })
        index = index + 1

    u.close()

    timing.append(('Total Tokenization', tokenization_time))
    timing.append(('Total Tagging', tagging_time))
    # write out corpus-wide rule CSV (if applicable)
    if rule_csv:
        ruleCSV(corpus_info, name, corpus_map)

    if ngram_count > 0:
        ngramCSV(documentNgramCounts, corpusNgramCounts, ngram_count,
                 corpus_info, name)

    frame = inspect.currentframe()
    tc_args, _, _, tc_values = inspect.getargvalues(frame)
    buildReadme(tc_args, tc_values, timing, version, blacklist, bad_texts)

    if app_mode:
        # Update progress. (Web version)
        if current_task.request.id is not None:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current': 100.0,
                                          'total': 100.0
                                      })

    if token_csv:
        TVpath = os.path.join(corpus_info["output_path"],
                              corpus_info["provenance"], 'TextViewer.html')
        print(TVpath)
        if includeTagViewer:
            shutil.copyfile('TextViewer.html', TVpath)
    print('tag_corpus finished. Total elapsed time: %.2f seconds.' %
          (time() - tag_corpus_start))

    return csv_path
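A minimal invocation sketch for tag_corpus; corpus_info and corpus_data_files are placeholders shaped the way the validation code above expects (a path/name/data dict plus a saved list of text paths, and the output_path/provenance keys referenced near the end of the function), with every other argument left at its default:

corpus_info = {
    "name": "my_corpus",
    "path": "/data/my_corpus",        # hypothetical locations
    "data": {"Text": {}},
    "output_path": "/data/output",
    "provenance": "run-001",
}
corpus_data_files = {
    "Text": {"saved": ["/data/my_corpus/a.txt", "/data/my_corpus/b.txt"]}
}
csv_path = tag_corpus(corpus_info, corpus_data_files)
print("Primary CSV written to", csv_path)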
Example No. 8
def csv_formatter_app(args):
    # Get the input files with the appropriate file extension.
    patterns = None
    if args.file_extension is not None:
        patterns = ("\." + args.file_extension + "$", )

    # Figure out which tagger we need.
    imported_tagger = getattr(
        __import__("Ity.Taggers", fromlist=[args.tagger_module_name]),
        args.tagger_module_name)

    # Make sure the corpus folder at corpus_path exists.
    # If args.corpus_path is an absolute path, os.path.join() will do the right thing.
    corpus_path = os.path.join(corpus_root, args.corpus_path)
    if not os.path.exists(corpus_path):
        raise ValueError("Corpus at path '%s' does not exist.")

    # TopicModelTagger and a few other things may need this.
    corpus_name = os.path.basename(corpus_path)

    # Filter by file names in the corpus.
    if args.filenames is not None and len(args.filenames) > 0:
        for index, filename in enumerate(args.filenames):
            args.filenames[index] = os.path.join(corpus_path, filename)
        input_paths = FilePaths.valid_paths(args.filenames,
                                            patterns=patterns,
                                            recursion_levels=3,
                                            debug=args.debug)
    else:
        input_paths = FilePaths.valid_paths((corpus_path, ),
                                            patterns=patterns,
                                            recursion_levels=3,
                                            debug=args.debug)

    ################################
    #### Initialize Ity Modules ####
    ################################

    tokenizer = RegexTokenizer()
    # Instantiate *one* tagger. Note that TopicModelTagger needs a model_path given to it.
    # TODO: Support for multiple taggers.
    # TODO: Run the TopicModel generator for a brand new corpus for which we have no metadata.
    # TODO: It seems like TopicModelTagger implies some kind of CorpusTagger with corpus-specific data. It'd be good to make that a real subclass.
    if args.tagger_module_name == "TopicModelTagger":
        tagger = imported_tagger(corpus_name=corpus_name)
    # Use the rules filename for SimpleRuleTagger if we got one. Otherwise, SimpleRuleTagger will use the rules in "default.csv".
    elif args.tagger_module_name == "SimpleRuleTagger" and args.rules_file is not None:
        tagger = imported_tagger(rules_filename=args.rules_file)
    else:
        tagger = imported_tagger()
    formatter = CSVFormatter()

    # Keep calm and DO THINGS
    tags_list = []
    tokens_list = []
    str_list = []
    text_name_list = []

    # Process each text in the corpus.
    for path_index, path in enumerate(input_paths):
        # Get the name of the text. That appears as output in the CSV.
        text_name = os.path.splitext(os.path.basename(path))[0]
        text_name_list.append(text_name)

        start_time = time()

        # Open the file and get its contents.
        the_file = codecs.open(path, encoding="utf-8")
        the_str = the_file.read()
        the_file.close()
        str_list.append(the_str)

        # Tokenize
        tokens = tokenizer.tokenize(the_str)
        tokens_list.append(tokens)

        # Tag
        tag_data, tag_maps = tagger.tag(tokens)
        tags_list.append([tag_data, tag_maps])

        end_time = time()

        # Debug output
        if args.debug:
            message = "\t** Processed '%s' (%u / %u) in %f seconds. **" % (
                os.path.basename(path), path_index + 1, len(input_paths),
                end_time - start_time)
            print(message)

    # Output the CSV.
    csv_str = formatter.batch_format(tags_list=tags_list,
                                     tokens_list=tokens_list,
                                     corpus_name=corpus_name,
                                     s_list=str_list,
                                     text_name_list=text_name_list)
    # Write the csv_str out to a file.
    if args.output_filename is None:
        csv_filename = corpus_name + "_" + args.tagger_module_name + ".csv"
    else:
        csv_filename = args.output_filename
    # Do we have a specified output directory in the args object?
    if args.output_dir is not None:
        csv_dir = os.path.abspath(os.path.expanduser(args.output_dir))
    else:
        # Output the CSV in the current working directory by default.
        csv_dir = os.path.abspath(os.path.dirname(__file__))
    # Create the output directory if it doesn't exist.
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    # Get the full file path to the output CSV.
    csv_path = os.path.join(csv_dir, csv_filename)
    # Write the CSV to disk.
    try:
        csv_file = codecs.open(csv_path, encoding="utf-8", mode="w")
        csv_file.write(csv_str)
        csv_file.close()
        # Debug output
        if args.debug:
            message = "** Wrote CSV containing tagged data for corpus '%s' to '%s'. **" % (
                corpus_name, csv_path)
            print(message)
        return csv_path
    except IOError:
        if args.debug:
            message = "**** Error writing out CSV containing tagged data for corpus '%s' to '%s'. ****" % (
                corpus_name, csv_path)
            print(message)
        return None
Example No. 9
def tag_text(text_path,
             corpus_info,
             corpus_data_files,
             tags,
             formats=None,
             write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    text_file = codecs.open(text_path, encoding="UTF-8")
    text_contents = text_file.read()
    text_file.close()

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tag_args in tags.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError(
                "Tagging multiple times with the same tagger is not yet supported."
            )
        tagger_name = tag_name + "Tagger"
        tagger_module = getattr(
            __import__("Ity.Taggers", fromlist=tagger_name), tagger_name)
        # Add some additional instantiation arguments for specific taggers.
        # TODO: Clean up Taggers' init() arguments.
        if tag_args is None:
            tagger_init_args = {}
        else:
            tagger_init_args = tag_args
        # Optionally use the rules file that was uploaded with the corpus.
        if tag_name == "SimpleRule" and (
                "SimpleRule" in corpus_data_files
                and "saved" in corpus_data_files["SimpleRule"]
                and len(corpus_data_files["SimpleRule"]["saved"]) > 0):
            if "rules_filename" not in tagger_init_args:
                if len(corpus_data_files["SimpleRule"]["saved"]) > 1:
                    raise NotImplementedError(
                        "Multiple rules files for SimpleRuleTagger is not yet supported."
                    )
                tagger_init_args.update(
                    rules_filename=corpus_data_files["SimpleRule"]["saved"][0])
            # Otherwise, SimpleRuleTagger will use the default rules file it knows the path to internally.
        elif tag_name == "TopicModel":
            tagger_init_args.update(corpus_name=corpus_info["name"])
        # Instantiate this tagger.
        tagger_instance = tagger_module(**tagger_init_args)
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger_instance.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            in tokenizer.excluded_token_types
        ]))
    if formats is not None:
        format_outputs = format_text(tag_maps,
                                     tokens,
                                     output_dict,
                                     corpus_info,
                                     formats,
                                     write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(
            format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict
Example No. 10
def _tag_text_with_existing_instances(text_path,
                                      corpus_info,
                                      corpus_data_files,
                                      taggers,
                                      formatters=None,
                                      write_to_disk=False):
    # Open the text file and get its contents.
    if not os.path.exists(text_path):
        raise ValueError("Text file '%s' does not exist." % text_path)
    text_name = os.path.basename(text_path)
    # Try to decode the file with multiple encodings
    text_file = None
    text_contents = None
    for encoding in ["UTF-8", "ISO-8859-1", "CP1252"]:
        try:
            text_file = codecs.open(text_path, encoding=encoding)
            text_contents = text_file.read()
            break
        except UnicodeDecodeError:
            pass
        finally:
            if text_file is not None:
                text_file.close()
    if text_contents is None:
        raise NotImplementedError(
            "Could not find a valid encoding for input file %s" % text_path)

    # Tokenize.
    tokenizer = RegexTokenizer()
    tokens = tokenizer.tokenize(text_contents)

    # Import and instantiate the taggers.
    tag_dicts = {}
    tag_maps = {}
    # TODO: Parallelize?
    for tag_name, tagger in taggers.items():
        if tag_name in tag_dicts or tag_name in tag_maps:
            raise NotImplementedError(
                "Tagging multiple times with the same tagger is not yet supported."
            )
        # Tag with this tagger.
        single_tag_data, single_tag_maps = tagger.tag(tokens)
        tag_dicts[tag_name] = single_tag_data
        tag_maps[tag_name] = single_tag_maps

    # Return the text name, list of tag dicts, and some token counts.
    output_dict = dict(
        text_path=text_path,
        text_name=text_name,
        text_key=nameToKey(os.path.splitext(text_name)[0]),
        corpus_name=corpus_info["name"],
        text_contents=text_contents,
        # tokens=tokens,
        tag_dicts=tag_dicts,
        # tags=tags,
        num_tokens=len(tokens),
        num_word_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["WORD"]
        ]),
        num_punctuation_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            == RegexTokenizer.TYPES["PUNCTUATION"]
        ]),
        num_included_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            not in tokenizer.excluded_token_types
        ]),
        num_excluded_tokens=len([
            token for token in tokens if token[RegexTokenizer.INDEXES["TYPE"]]
            in tokenizer.excluded_token_types
        ]))
    if formatters is not None:
        format_outputs = _format_text_with_existing_instances(
            tag_maps,
            tokens,
            output_dict,
            corpus_info,
            formatters,
            write_to_disk=write_to_disk)
        output_dict["format_outputs"] = format_outputs
        output_dict["html_name"] = os.path.basename(
            format_outputs["HTML"]["app"])
    # del output_dict["tags"]
    return output_dict