def processvolume(triplet):
    """Read one volume, normalize and correct its tokens, and write
    volume-level token counts to a tab-separated output file."""
    infile, outfile, integer = triplet
    print(integer)

    try:
        with open(infile, encoding='utf-8') as f:
            text = f.readlines()
    except Exception:
        # If the file can't be read there is nothing to process; report and bail out.
        print("ERROR reading file " + infile)
        return "null"

    tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream([text], verbose=False)

    correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=False)

    # Merge the per-page counts into a single dictionary for the whole volume.
    masterdict = dict()
    for page in pages:
        for item in page:
            if item in masterdict:
                masterdict[item] += page[item]
            else:
                masterdict[item] = page[item]

    # Write one "token<TAB>count" line per token, skipping structural features
    # (keys beginning with '#').
    with open(outfile, mode='w', encoding='utf-8') as f:
        for key, value in masterdict.items():
            if not key.startswith('#'):
                f.write(key + '\t' + str(value) + '\n')

    print(str(integer) + ' complete.')

    return "null"
Example no. 3
def process_a_file(file_tuple):
	"""Read one HathiTrust volume, normalize and correct its tokens, write a
	corrected text (.norm.txt) and a page-level feature file (.pg.tsv), and
	return a dictionary of per-volume metadata and errors."""
	global testrun, pairtreepath, datapath, genremapdir, felecterrors, selecttruths, debug, phraseset, pagevocabset, meaningfulheaders

	thisID, metadata_evidence = file_tuple

	perfileerrorlog = list()
	return_dict = dict()
	return_dict["htid"] = thisID
	return_dict["metadata"] = (thisID, "0", "0", "0", "0", "0")
	return_dict["errors"] = []
	return_dict["phrasecounts"] = dict()

	if testrun:
		cleanID = clean_pairtree(thisID.replace("norm.txt", ""))
	else:
		cleanID = clean_pairtree(thisID)

	if not testrun:
		filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
		filename = filepath + postfix + '/' + postfix + ".zip"
	else:
		filename = datapath + thisID

	# ACTUALLY READ THE FILE.

	if filename.endswith('.zip'):
		pagelist, successflag = read_zip(filename)
	else:
		pagelist, successflag = read_txt(filename)

	if successflag == "missing file":
		print(thisID + " is missing.")
		perfileerrorlog.append(thisID + '\t' + "missing")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	elif successflag == "pagination error":
		print(thisID + " has a pagination problem.")
		perfileerrorlog.append(thisID + '\t' + "paginationerror")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	elif successflag == "unicode error":
		print(thisID + " can not be decoded by unicode.")
		perfileerrorlog.append(thisID + '\t' + "unicode error")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream(pagelist, verbose=debug)

	if pre_english < 0.6:
		perfileerrorlog.append(thisID + '\t' + "not english")

	tokencount = len(tokens)

	if tokencount < 10:
		print(thisID, "has only", tokencount, "tokens.")
		perfileerrorlog.append(thisID + '\t' + 'short')

	correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

	# Combine the page dictionaries into a master dictionary for the whole volume.
	# (A single dictionary isn't produced in the first place because the Volume
	# module needs per-page counts for other purposes.)

	masterdict = dict()
	for page in pages:
		for item in page:
			if item in masterdict:
				masterdict[item] += page[item]
			else:
				masterdict[item] = page[item]

	# Now that we have a master dictionary, check whether the volume has a long-s
	# problem (OCR reading the long s, ſ, as f). This simple frequency comparison
	# works adequately.

	errors = 1
	truths = 1
	# Initialized to 1 as a Laplacian correction.
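	# For example, if the long-s confusion words in felecterrors occur 40 times in
	# the volume and the control words in selecttruths occur 5 times, then
	# errors = 41 > truths = 6, and the volume is flagged for contextual correction.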

	for word in felecterrors:
		errors = errors + masterdict.get(word, 0)
	for word in selecttruths:
		truths = truths + masterdict.get(word, 0)

	LongSproblem = errors >= truths

	if not LongSproblem:
		corrected = correct_tokens
		deleted = dict()
		added = dict()
	else:
		deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)
		# okay, this is crazy and not efficient to run, but it's easy to write and there are a small number
		# of these files -- so I'm going to count the new contextually-corrected tokens by re-running them
		# through Volume.
		correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(corrected, verbose=debug)

		corrected = correct_tokens

	# If we are upvoting tokens in the header, they need to be added here.

	if len(pages) != len(headerlist):
		print(thisID + " fails a routine check of alignment between pages and headers.")
	else:
		for index, page in enumerate(pages):
			thispageheader = headerlist[index]
			header_tokens, header_pages, dummy1, dummy2 = NormalizeVolume.correct_stream(thispageheader, verbose=debug)
			headerdict = header_pages[0]
			for key, value in headerdict.items():
				if key in meaningfulheaders:
					if key in page:
						page[key] += 2
						# a fixed increment no matter how many times the word occurs in the
						# header
					else:
						page[key] = 2
						print("Word " + key + " in headerdict for " + thisID + " at " + str(index) + " but not main page.")

	# Write corrected file.
	cleanHTID = clean_pairtree(thisID)

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", "")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".norm.txt")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", "norm.txt")
		else:
			outHTID = cleanHTID + ".norm.txt"

		outfilename = outpath + "texts/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".norm.txt"

	with open(outfilename, mode='w', encoding='utf-8') as file:
		for token in corrected:
			if token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
				token = token + " "
			file.write(token)

	if len(pages) != len(pagedata):
		perfileerrorlog.append("Discrepancy between page data and page metadata in \t" + thisID)
		return_dict["errors"] = perfileerrorlog
		return return_dict

	totalwordsinvol = 0

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", ".pg.tsv")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".pg.tsv")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", ".pg.tsv")
		else:
			outHTID = cleanHTID + ".pg.tsv"

		outfilename = outpath + "pagefeatures/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".pg.tsv"

	with open(outfilename, mode='w', encoding='utf-8') as file:

		if metadata_evidence["biography"]:
			file.write("-1\t#metaBiography\t0\n")

		if metadata_evidence["drama"]:
			file.write("-1\t#metaDrama\t0\n")

		if metadata_evidence["fiction"]:
			file.write("-1\t#metaFiction\t0\n")

		if metadata_evidence["poetry"]:
			file.write("-1\t#metaPoetry\t0\n")

		numberofpages = len(pages)
		for index, page in enumerate(pages):

			# This is a shameful hack that should be deleted later.
			if testrun and "estimated" in page and "percentage" in page and (index + 3) > numberofpages:
				continue
			if testrun and "untypical" in page and (index +2) > numberofpages:
				continue

			otherfeatures = 0

			for feature, count in page.items():
				if feature in pagevocabset or feature.startswith("#"):
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					# pagenumber, featurename, featurecount
					file.write(outline)
				else:
					otherfeatures += count

				if not feature.startswith("#"):
					totalwordsinvol += count
				# This is because there are structural features like #allcapswords
				# that should not be counted toward total token count.

			structural_features = pagedata[index]
			for feature, count in structural_features.items():
				if count > 0 or feature == "#textlines":
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					file.write(outline)

			if otherfeatures > 0:
				outline = str(index) + '\t' + "wordNotInVocab" + '\t' + str(otherfeatures) + '\n'
				file.write(outline)

	metatuple = (thisID, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched), str(post_english))

	return_dict["metadata"] = metatuple
	return_dict["errors"] = perfileerrorlog

	return return_dict
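
# A hedged usage sketch, not from the original source: process_a_file takes an
# (htid, metadata_evidence) tuple and returns a dict with "htid", "metadata",
# "errors", and "phrasecounts" keys, which makes it straightforward to map over a
# worker pool and collect results. The volume IDs and evidence flags below are
# hypothetical placeholders, and the module-level globals that process_a_file
# declares would need to be initialized first.
from multiprocessing import Pool

if __name__ == '__main__':
    evidence = {"biography": False, "drama": False, "fiction": True, "poetry": False}
    file_tuples = [("uc1.b3342759", evidence), ("mdp.39015012345678", evidence)]

    with Pool(processes=2) as pool:
        results = pool.map(process_a_file, file_tuples)

    metadata_rows = [r["metadata"] for r in results]
    all_errors = [err for r in results for err in r["errors"]]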
Example no. 4
def process_a_file(file_tuple):
	"""Read one HathiTrust volume, normalize and correct its tokens, write a
	corrected text (.norm.txt) and a page-level feature file (.pg.tsv), and
	return a dictionary of per-volume metadata and errors."""
	global testrun, pairtreepath, datapath, genremapdir, felecterrors, selecttruths, debug, phraseset, pagevocabset, meaningfulheaders

	thisID, metadata_evidence = file_tuple

	perfileerrorlog = list()
	return_dict = dict()
	return_dict["htid"] = thisID
	return_dict["metadata"] = (thisID, "0", "0", "0", "0", "0")
	return_dict["errors"] = []
	return_dict["phrasecounts"] = dict()

	if testrun:
		cleanID = clean_pairtree(thisID.replace("norm.txt", ""))
	else:
		cleanID = clean_pairtree(thisID)

	if not testrun:
		filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
		filename = filepath + postfix + '/' + postfix + ".zip"
	else:
		filename = datapath + thisID

	# ACTUALLY READ THE FILE.

	if filename.endswith('.zip'):
		pagelist, successflag = read_zip(filename)
	else:
		pagelist, successflag = read_txt(filename)

	if successflag == "missing file":
		print(thisID + " is missing.")
		perfileerrorlog.append(thisID + '\t' + "missing")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	elif successflag == "pagination error":
		print(thisID + " has a pagination problem.")
		perfileerrorlog.append(thisID + '\t' + "paginationerror")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream(pagelist, verbose=debug)

	if pre_english < 0.6:
		perfileerrorlog.append(thisID + '\t' + "not english")

	tokencount = len(tokens)

	if tokencount < 10:
		print(thisID, "has only", tokencount, "tokens.")
		perfileerrorlog.append(thisID + '\t' + 'short')

	correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

	# Combine the page dictionaries into a master dictionary for the whole volume.
	# (A single dictionary isn't produced in the first place because the Volume
	# module needs per-page counts for other purposes.)

	masterdict = dict()
	for page in pages:
		for item in page:
			if item in masterdict:
				masterdict[item] += page[item]
			else:
				masterdict[item] = page[item]

	# Now that we have a master dictionary, check whether the volume has a long-s
	# problem (OCR reading the long s, ſ, as f). This simple frequency comparison
	# works adequately.

	errors = 1
	truths = 1
	# Initialized to 1 as a Laplacian correction.
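	# For example, if the long-s confusion words in felecterrors occur 40 times in
	# the volume and the control words in selecttruths occur 5 times, then
	# errors = 41 > truths = 6, and the volume is flagged for contextual correction.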

	for word in felecterrors:
		errors = errors + masterdict.get(word, 0)
	for word in selecttruths:
		truths = truths + masterdict.get(word, 0)

	LongSproblem = errors >= truths

	if not LongSproblem:
		corrected = correct_tokens
		deleted = dict()
		added = dict()
	else:
		deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)
		# okay, this is crazy and not efficient to run, but it's easy to write and there are a small number
		# of these files -- so I'm going to count the new contextually-corrected tokens by re-running them
		# through Volume.
		correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(corrected, verbose=debug)

		corrected = correct_tokens

	# If we are upvoting tokens in the header, they need to be added here.
	# Note: this assumes pages and headerlist are aligned (same length and order).

	for index, page in enumerate(pages):
		thispageheader = headerlist[index]
		header_tokens, header_pages, dummy1, dummy2 = NormalizeVolume.correct_stream(thispageheader, verbose=debug)
		headerdict = header_pages[0]
		for key, value in headerdict.items():
			if key in meaningfulheaders:
				if key in page:
					page[key] += 2
					# a fixed increment no matter how many times the word occurs in the
					# header
				else:
					page[key] = 2
					print("Word " + key + " in headerdict for " + thisID + " at " + str(index) + " but not main page.")

	# Write corrected file.
	cleanHTID = clean_pairtree(thisID)

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", "")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".norm.txt")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", "norm.txt")
		else:
			outHTID = cleanHTID + ".norm.txt"

		outfilename = outpath + "texts/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".norm.txt"

	with open(outfilename, mode='w', encoding='utf-8') as file:
		for token in corrected:
			if token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
				token = token + " "
			file.write(token)

	if len(pages) != len(pagedata):
		perfileerrorlog.append("Discrepancy between page data and page metadata in \t" + thisID)

	totalwordsinvol = 0

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", ".pg.tsv")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".pg.tsv")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", ".pg.tsv")
		else:
			outHTID = cleanHTID + ".pg.tsv"

		outfilename = outpath + "pagefeatures/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".pg.tsv"

	with open(outfilename, mode='w', encoding='utf-8') as file:

		if metadata_evidence["biography"]:
			file.write("-1\t#metaBiography\t0\n")

		if metadata_evidence["drama"]:
			file.write("-1\t#metaDrama\t0\n")

		if metadata_evidence["fiction"]:
			file.write("-1\t#metaFiction\t0\n")

		if metadata_evidence["poetry"]:
			file.write("-1\t#metaPoetry\t0\n")

		numberofpages = len(pages)
		for index, page in enumerate(pages):

			# This is a shameful hack that should be deleted later.
			if testrun and "estimated" in page and "percentage" in page and (index + 3) > numberofpages:
				continue
			if testrun and "untypical" in page and (index +2) > numberofpages:
				continue

			otherfeatures = 0

			for feature, count in page.items():
				if feature in pagevocabset or feature.startswith("#"):
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					# pagenumber, featurename, featurecount
					file.write(outline)
				else:
					otherfeatures += count

				if not feature.startswith("#"):
					totalwordsinvol += count
				# This is because there are structural features like #allcapswords
				# that should not be counted toward total token count.

			structural_features = pagedata[index]
			for feature, count in structural_features.items():
				if count > 0 or feature == "#textlines":
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					file.write(outline)

			if otherfeatures > 0:
				outline = str(index) + '\t' + "wordNotInVocab" + '\t' + str(otherfeatures) + '\n'
				file.write(outline)

	metatuple = (thisID, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched), str(post_english))

	return_dict["metadata"] = metatuple
	return_dict["errors"] = perfileerrorlog

	return return_dict
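
# A hedged sketch, not the original clean_pairtree used above: HathiTrust volume
# IDs contain characters that are awkward in file names, and the standard pairtree
# "clean" convention replaces ':' with '+' and '/' with '=' in the part of the ID
# after the library prefix. The real function may differ in detail.
def clean_pairtree_sketch(htid):
    prefix, dot, postfix = htid.partition('.')
    postfix = postfix.replace(':', '+').replace('/', '=')
    return prefix + dot + postfix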
Example no. 5
import sys

import FileCabinet
import NormalizeVolume

debug = False

pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']

targetfile = sys.argv[1]

with open(targetfile, encoding='utf-8') as f:
    text = f.readlines()

tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream([text], verbose=debug)

correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

# Merge the per-page counts into a single dictionary for the whole volume.
masterdict = dict()
for page in pages:
    for item in page:
        if item in masterdict:
            masterdict[item] += page[item]
        else:
            masterdict[item] = page[item]


for key, value in masterdict.items():
    if not key.startswith('#'):
Example no. 7
        pagelist, successflag = read_txt(filename)

    if successflag == "missing file":
        print(thisID + " is missing.")
        errorlog.append(thisID + '\t' + "missing")
        continue
    elif successflag == "pagination error":
        print(thisID + " has a pagination problem.")
        errorlog.append(thisID + '\t' + "paginationerror")
        continue

    genremappath = genremapdir + cleanID + ".predict"

    genremap = get_map(genremappath)

    tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream(pagelist, verbose=debug)

    if pre_english < 0.6:
        print(thisID + " is suspicious.")
        errorlog.append(thisID + '\t' + "not english")
    else:
        if len(genremap) > 0:
            genreset = set(genremap.values())
            thisvoldict = PhraseCounter.count_phrases(tokens, genremap, phraseset, genreset, cleanID)
            phrasecount[thisID] = thisvoldict

    # with open(headeroutpath, mode="a", encoding="utf-8") as f:
    #     for astream in headerlist:
    #         if len(astream) > 0:
    #             outline = " ".join([x for x in astream])
    #             f.write(outline + '\n')
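
# A hedged sketch, not the original implementation: get_map (used above) reads a
# ".predict" genre map for one volume and returns a dict whose values are genre
# codes, since the caller does set(genremap.values()). The two-column
# "page<TAB>genre" format and the empty-dict fallback for a missing file are
# assumptions for illustration only.
import os

def get_map_sketch(predictpath):
    genremap = dict()
    if not os.path.isfile(predictpath):
        return genremap
    with open(predictpath, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            if len(fields) >= 2:
                genremap[fields[0]] = fields[1]
    return genremap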