# Example #1
# 0
def display_non_duplicate_plutarch(filenames):
    """Print Plutarch files whose contents duplicate another file, then
    print every Plutarch file, so the two lists can be compared.

    filenames: iterable of file paths to read and compare by content hash.
    """
    # Hash each file's contents; the context manager closes each handle
    # promptly (the original `open(name).read()` leaked every handle).
    content_hashes = {}
    for name in filenames:
        with open(name) as fh:
            content_hashes[name] = hash(fh.read())

    # Invert the map: content hash -> set of paths sharing that content.
    rev_multidict = {}
    for key, value in content_hashes.items():
        rev_multidict.setdefault(value, set()).add(key)

    # Any hash bucket with more than one path is a group of duplicates;
    # keep only the duplicates that live under the Plutarch directory.
    dup_names = (group for group in rev_multidict.values() if len(group) > 1)
    dup_plutarch_filenames = sorted(
        elem
        for duplicates in dup_names
        for elem in duplicates
        if elem.startswith('tesserae/texts/grc/plutarch/'))
    plutarch_filenames = sorted(
        _get_filenames('tesserae/texts/grc/plutarch', 'tess', set()))

    #Prints files in the plutarch directory that have been identified as duplicates
    print('\n'.join(dup_plutarch_filenames))
    print()

    #Prints all files in the plutarch directory
    print('\n'.join(plutarch_filenames))
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars
from functools import reduce
import os
from os.path import join
from extract_features import _get_filenames, parse_tess
from greek_features import composite_files_to_exclude
from textual_feature import sentence_tokenizers

# Count the sentences in every .tess file of the Greek corpus and write the
# results to sentence_counts.csv.
corpus_dir = join('tesserae', 'texts', 'grc')
files = _get_filenames(corpus_dir, 'tess', composite_files_to_exclude)

# `with` guarantees the CSV is flushed and closed (the original left the
# handle open for the lifetime of the process).
with open('sentence_counts.csv', mode='w') as f:
    # Provenance header: data/project URLs plus the exact git commits of the
    # analysis code and of the corpus submodule.
    f.write(
        'Data: https://github.com/timgianitsos/tesserae/tree/master/texts/grc,Project: https://www.qcrit.org,Author: Tim Gianitsos ([email protected]),Repo (Private): https://github.com/jdexter476/ProseVerseClassification.git,Code commit: '
        + os.popen('git rev-parse HEAD').read().strip() + ',Corpus commit: ' +
        os.popen('git -C "./tesserae" rev-parse HEAD').read().strip() + '\n')
    f.write('file name,number of sentences\n')
    for file in files:
        file_text = parse_tess(file)
        # Sentence count via the project's Ancient Greek Punkt tokenizer.
        num_sentences = len(
            sentence_tokenizers['ancient_greek'].tokenize(file_text))
        # Emit only the basename of each path alongside its sentence count.
        f.write(file[file.rindex(os.sep) + 1:] + ',' + str(num_sentences) + '\n')
print('Success!')
    if line[1] not in labels:
        labels[line[1]] = len(labels)
    vg[line[0]] = line[1]
# Tally how many files fall into each verse genre.
cnt.update(vg.values())
# Sanity checks: the verse-genre map must cover exactly 141 files and agree
# with two known spot-check labels.
assert len(vg) == 141
assert vg['aeschylus.agamemnon.tess'] == 'drama'
assert vg['tryphiodorus.the_taking_of_ilios.tess'] == 'epic'
# Every prose (pg) and verse (vg) file must be counted exactly once overall.
assert sum(cnt.values()) == len(pg) + len(vg)
print('Verse genres:', set(vg.values()))
print('Counts:', cnt)

# labels maps each genre name to the integer code assigned on first sight.
print('Category key:', labels)

# Scan the corpus ONCE and build both directions of the basename <-> path
# mapping (the original called _get_filenames twice for the same listing).
_corpus_paths = _get_filenames('tesserae/texts/grc', 'tess', set())
filename_to_path = {
    p[p.rindex(os.sep) + 1:]: p
    for p in _corpus_paths
}
path_to_filename = {
    p: p[p.rindex(os.sep) + 1:]
    for p in _corpus_paths
}
# Equal sizes mean no two corpus files share a basename, so the
# basename-keyed map lost nothing.
assert len(filename_to_path) == len(path_to_filename)
# Merge prose (pg) and verse (vg) genre labels into one filename -> genre map.
file_to_genre = dict(**pg, **vg)
output_file = 'genre_labels.csv'
print('Writing to ' + output_file + '...')
# NOTE: f must stay open here — the loop below (past this block) writes the
# per-file rows, so a `with` block cannot end at this point.
f = open(output_file, 'w')
# First line is the genre -> integer-code key, then the CSV column header.
f.write(','.join(k + ':' + str(v) for k, v in labels.items()) + '\n')
f.write('Filename,Genre\n')
for path in sorted(
        filename_to_path.values()
):  #Iterate over sorted values so that the order will match prosody_labels.csv