def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.'''

    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0

    for page in genresequence:
        indexas = page

        # For this purpose, we treat biography and indexes as equivalent to nonfiction.
        if page == "bio" or page == "index" or page == "back" or page == "trv":
            indexas = "non"

        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    return genrecounts, maxgenre
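A minimal usage sketch (hypothetical page labels; assumes the SonicScrewdriver helpers addtodict and sortkeysbyvalue increment counts and return descending (count, key) pairs, as the comments above describe):

    counts, top = sequence_to_counts(['fic', 'fic', 'fic', 'bio'])
    # counts -> {'fic': 3, 'poe': 0, 'dra': 0, 'non': 1}; top -> 'fic'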
Example #2
def select_common_features(trainingset, n):
    ''' Very simply, selects the top n features in the training set.
	Not a sophisticated feature-selection strategy, but in many
	cases it gets the job done.
	'''
    allwordcounts = dict()

    for avolume in trainingset:
        utils.add_dicts(avolume.rawcounts, allwordcounts)
        # The add_dicts function will add up all the raw counts into
        # a single master dictionary.

    descendingbyfreq = utils.sortkeysbyvalue(allwordcounts,
                                             whethertoreverse=True)
    # This returns a list of 2-tuple (frequency, word) pairs.

    if n > len(descendingbyfreq):
        n = len(descendingbyfreq)
        print("We only have " + str(n) + " features.")

    # List comprehension that gets the second element of each tuple, up to
    # a total of n tuples.

    topfeatures = [x[1] for x in descendingbyfreq[0:n]]

    return topfeatures
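A quick usage sketch with hypothetical volume objects (anything exposing a rawcounts dictionary works; assumes utils.add_dicts sums dictionaries as the comment above says):

    from collections import namedtuple
    Volume = namedtuple('Volume', ['rawcounts'])
    training = [Volume({'the': 10, 'whale': 2}), Volume({'the': 7, 'sea': 3})]
    print(select_common_features(training, 2))   # expected: ['the', 'sea']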
Example #3
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.
    Note that this function cannot return "bio." If
    biography is the largest genre it returns "non"fiction.
    It counts bio, but ensures that all votes for bio are also votes
    for non.
    '''

    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)
        if page == 'bio':
            utils.addtodict('non', 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse = True)
    maxgenre = genretuples[0][1]

    if maxgenre == 'bio':
        maxgenre = 'non'

    return genrecounts, maxgenre
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.'''

    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0

    for page in genresequence:
        indexas = page

        # For this purpose, we treat biography and indexes as equivalent to nonfiction.
        if page == "bio" or page == "index" or page == "back" or page == "trv":
            indexas = "non"

        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse = True)
    maxgenre = genretuples[0][1]

    return genrecounts, maxgenre
Example #5
def select_common_features(trainingset, n):
	''' Very simply, selects the top n features in the training set.
	Not a sophisticated feature-selection strategy, but in many
	cases it gets the job done.
	'''
	allwordcounts = dict()

	for avolume in trainingset:
		utils.add_dicts(avolume.rawcounts, allwordcounts)
		# The add_dicts function will add up all the raw counts into
		# a single master dictionary.

	descendingbyfreq = utils.sortkeysbyvalue(allwordcounts, whethertoreverse = True)
	# This returns a list of 2-tuple (frequency, word) pairs.

	if n > len(descendingbyfreq):
		n = len(descendingbyfreq)
		print("We only have " + str(n) + " features.")

	# List comprehension that gets the second element of each tuple, up to
	# a total of n tuples.

	topfeatures = [x[1] for x in descendingbyfreq[0 : n]]

	return topfeatures
Example #6
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.
    Note that this function cannot return "bio." If
    biography is the largest genre it returns "non"fiction.
    It counts bio, but ensures that all votes for bio are also votes
    for non.
    '''

    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)
        if page == 'bio':
            utils.addtodict('non', 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if maxgenre == 'bio':
        maxgenre = 'non'

    return genrecounts, maxgenre
def comparelists(firstmap, secondmap, genremistakes, correctbygenre,
                 wordcounts):
    if len(firstmap) > len(secondmap):
        length = len(secondmap)
    elif len(firstmap) == len(secondmap):
        length = len(firstmap)
    else:
        print(
            "Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer."
        )
        # Fall back to the shorter list so 'length' is defined for the loop below.
        length = len(firstmap)

    divergence = 0.0

    for i in range(length):

        generalizedfirst = translate(firstmap[i])
        generalizedsecond = translate(secondmap[i])

        if effectively_equal(generalizedfirst, generalizedsecond):
            utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
        else:
            divergence += wordcounts[i]
            utils.addtodict((generalizedsecond, generalizedfirst),
                            wordcounts[i], genremistakes)

    return divergence
Example #8
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre.

    Note that this version of the function is slightly different
    from the version in MetadataCascades, in allowing a wider range
    of genres and not initializing anything to zero.'''

    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)

    return genrecounts
Example #9
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre.

    Note that this version of the function is slightly different
    from the version in MetadataCascades, in allowing a wider range
    of genres and not initializing anything to zero.'''

    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)

    return genrecounts
Example #10
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.
    '''

    df = dict()
    # We initially construct the data frame as a dictionary of Series.
    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index = allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']

                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')

                if feature in vocabset:
                    df[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index = allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index = allIDs)

    return scaleddf
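For reference, the per-volume CSV files read here (and by the vocabulary functions below) are assumed to have at least a feature column and a count column, with running-header words prefixed by '#header', e.g.:

    feature,count
    the,215
    #headerchapter,12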
Example #11
def get_vocabulary_and_counts_4pages(metadata, allIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.

    Adjusted to handle page instances.
    '''

    doc_freq = Counter()
    counts = dict()
    id2group = dict()

    for docid in allIDs:

        path = os.path.join(sourcedir,
                            utils.clean_pairtree(docid) + '.basic.json.bz2')
        volume = parser.PagelistFromJson(path, docid)
        pagecounts = volume.get_feature_list()

        for idx, page in enumerate(pagecounts):
            pageid = docid + '||' + str(idx)

            id2group[pageid] = docid

            counts[pageid] = page
            for key, value in page.items():
                doc_freq[key] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts, id2group
Example #12
def add_to_ficgenre(docid, existingfile, tagas):
    global outfieldnames, metadata
    with open(existingfile, mode = 'a', encoding = 'utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = outfieldnames)
        o = dict()
        j = metadata[docid]
        fields = [j['HTid'], str(j['date']), j['author'], j['title'], j['enumcron']]
        print(" | ".join(fields))
        o['docid'] = utils.clean_pairtree(j['HTid'])
        o['recordid'] = j['recordid']
        o['oclc'] = j['OCLC']
        o['locnum'] = j['LOCnum']
        o['author'] = j['author']
        o['imprint'] = j['imprint']
        o['date'] = j['date']
        o['firstpub'] = input('First publication date? ')
        o['birthdate'] = input('Author birth year? ')
        o['nationality'] = input('Nationality? ')
        o['gender'] = input('Gender? ')
        o['title'] = j['title']
        o['subjects'] = j['subjects']
        o['enumcron'] = j['enumcron']
        o['genretags'] = tagas
        for key, value in o.items():
            if o[key] == '<blank>':
                o[key] = ''
        writer.writerow(o)
    print('Done.')
Example #13
def get_classvector(classpath, volumeIDs):
    with open(classpath, encoding='utf-8') as f:
        filelines = f.readlines()
    classdict = dict()
    for line in filelines:
        line = line.rstrip()
        fields = line.split('\t')
        volid = utils.clean_pairtree(fields[0])
        theclass = fields[1]
        if theclass == 'elite':
            intclass = 1
        elif theclass == 'vulgar':
            intclass = 0
        else:
            intclass = int(theclass)
        classdict[volid] = intclass

    if len(volumeIDs) < 1:
        volumeIDs = [x for x in classdict.keys()]

    classvector = np.zeros(len(volumeIDs))
    for idx, anid in enumerate(volumeIDs):
        if anid in classdict:
            classvector[idx] = classdict[anid]
        else:
            print('Missing from class metadata: ' + anid)

    return classvector, volumeIDs
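The class file read here is expected to be tab-separated, one volume per line, with the label ('elite', 'vulgar', or an integer) in the second column; the IDs below are made up:

    mdp.39015012345678	elite
    uc1.b3342759	vulgar
    inu.30000012345678	1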
Example #14
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.
    '''

    df = dict()
    # We initially construct the data frame as a dictionary of Series.
    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index=allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']

                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')

                if feature in vocabset:
                    df[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index=allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index=allIDs)

    return scaleddf
Example #15
def get_classvector(classpath, volumeIDs):
	with open(classpath, encoding = 'utf-8') as f:
		filelines = f.readlines()
	classdict = dict()
	for line in filelines:
		line = line.rstrip()
		fields = line.split('\t')
		volid = utils.clean_pairtree(fields[0])
		theclass = fields[1]
		if theclass == 'elite':
			intclass = 1
		elif theclass == 'vulgar':
			intclass = 0
		else:
			intclass = int(theclass)
		classdict[volid] = intclass

	if len(volumeIDs) < 1:
		volumeIDs = [x for x in classdict.keys()]

	classvector = np.zeros(len(volumeIDs))
	for idx, anid in enumerate(volumeIDs):
		if anid in classdict:
			classvector[idx] = classdict[anid]
		else:
			print('Missing from class metadata: ' + anid)

	return classvector, volumeIDs
Example #16
def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''

    global rowindices, columns, metadata, modelindices, modeldata

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)

    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:

            if info == "Biography" or info == "Autobiography":
                probablybiography = True

            if info == "Fiction" or info == "Novel":
                probablyfiction = True

            if (info == "Poetry" or info == "Poems"):
                probablypoetry = True

            if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
                probablydrama = True

    if htid in modelindices:

        title = metadata["title"][htid].lower()
        titlewords = title.split()

        maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid],
                              modeldata["fic"][htid], modeldata["non"][htid],
                              modeldata["poe"][htid]))

        if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
            probablypoetry = True

        if maxgenre == 1:
            probablydrama = True

        if maxgenre == 2:
            maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''

    global rowindices, columns, metadata, modelindices, modeldata


    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)

    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:

            if info == "Biography" or info == "Autobiography":
                probablybiography = True

            if info == "Fiction" or info == "Novel":
                probablyfiction = True

            if (info == "Poetry" or info == "Poems"):
                probablypoetry = True

            if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
                probablydrama = True

    if htid in modelindices:

        title = metadata["title"][htid].lower()
        titlewords = title.split()

        maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid], modeldata["fic"][htid], modeldata["non"][htid], modeldata["poe"][htid]))

        if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
            probablypoetry = True

        if maxgenre == 1:
            probablydrama = True

        if maxgenre == 2:
            maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
Example #18
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    global genretranslations
    assert (len(truelist) == len(predicted))

    errorsbygenre = dict()
    correctbygenre = dict()
    accurate = 0
    inaccurate = 0
    totaltruegenre = dict()

    for index, truegenre in enumerate(truelist):
        if truegenre in genretranslations:
            truegenre = genretranslations[truegenre]

        if whethertocountwords:
            increment = wordsperpage[index]
        else:
            increment = 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if genresareequal(truegenre, predictedgenre):
            utils.addtodict(truegenre, increment, correctbygenre)
            accurate += increment
        else:
            utils.addtodict((truegenre, predictedgenre), increment,
                            errorsbygenre)
            inaccurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    global genretranslations
    assert(len(truelist) == len(predicted))

    errorsbygenre = dict()
    correctbygenre = dict()
    accurate = 0
    inaccurate = 0
    totaltruegenre = dict()

    for index, truegenre in enumerate(truelist):
        if truegenre in genretranslations:
            truegenre = genretranslations[truegenre]

        if whethertocountwords:
            increment = wordsperpage[index]
        else:
            increment = 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if genresareequal(truegenre, predictedgenre):
            utils.addtodict(truegenre, increment, correctbygenre)
            accurate += increment
        else:
            utils.addtodict((truegenre, predictedgenre), increment, errorsbygenre)
            inaccurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
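A usage sketch with made-up pages (assumes genresareequal treats identical labels as equal and that genretranslations leaves these labels unchanged):

    total, correct, errors, accurate, inaccurate = compare_two_lists(
        ['fic', 'non'], ['fic', 'poe'], [300, 250], True)
    # accurate -> 300; inaccurate -> 250; errors -> {('non', 'poe'): 250}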
Example #20
def comparelists(firstmap, secondmap, genremistakes, correctbygenre, wordcounts):
	if len(firstmap) > len(secondmap):
		length = len(secondmap)
	elif len(firstmap) == len(secondmap):
		length = len(firstmap)
	else:
		print("Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer.")
		# Fall back to the shorter list so 'length' is defined for the loop below.
		length = len(firstmap)

	divergence = 0.0

	for i in range(length):

		generalizedfirst = translate(firstmap[i])
		generalizedsecond = translate(secondmap[i])

		if effectively_equal(generalizedfirst, generalizedsecond):
			utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
		else:
			divergence += wordcounts[i]
			utils.addtodict((generalizedsecond, generalizedfirst), wordcounts[i], genremistakes)

	return divergence
Example #21
    def addmetadata(self, row, table):
        self.author = table['author'][row]
        self.title = table['title'][row]
        self.date = utils.simple_date(row, table)
        genrelist = table['genres'][row].split(';')
        self.genres = set(genrelist)

        varietiesofnon = ['Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia',
                          'Handbooks', 'Indexes', 'Legislation', 'Directories',
                          'Statistics', 'Legal cases', 'Legal articles', 'Calendars',
                          'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches']

        self.nonmetaflag = False
        for genre in varietiesofnon:
            if genre in self.genres:
                self.nonmetaflag = True
Example #22
    def addmetadata(self, row, table):
        self.author = table['author'][row]
        self.title = table['title'][row]
        self.date = utils.simple_date(row, table)
        genrelist = table['genres'][row].split(';')
        self.genres = set(genrelist)

        varietiesofnon = ['Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia',
                          'Handbooks', 'Indexes', 'Legislation', 'Directories',
                          'Statistics', 'Legal cases', 'Legal articles', 'Calendars',
                          'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches']

        self.nonmetaflag = False
        for genre in varietiesofnon:
            if genre in self.genres:
                self.nonmetaflag = True
def resolve_voting(votes, tiebreaker):
    electorate = len(votes)

    results = dict()
    for vote in votes:
        # if vote == "bio":
        #   vote = "non"
        utils.addtodict(vote, 1, results)
    candidate = utils.sortkeysbyvalue(results, whethertoreverse = True)

    dissent = (electorate - candidate[0][0]) / electorate

    if len(candidate) < 2:
        # There is only one candidate.
        return candidate[0][1], dissent, candidate[0][1]

    elif candidate[0][0] > candidate[1][0]:
        # We have a majority.
        return candidate[0][1], dissent, candidate[1][1]

    else:
        # We have a tie.
        if tiebreaker == candidate[0][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[0][1], dissent, candidate[1][1]
        elif tiebreaker == candidate[1][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[1][1], dissent, candidate[0][1]
        else:
            print("Tie in spite of " + tiebreaker)
            win = random.choice([candidate[0][1], candidate[1][1]])
            if win == candidate[0][1]:
                runnerup = candidate[1][1]
            else:
                runnerup = candidate[0][1]

            return win, dissent, runnerup
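For example, with three page votes and a tiebreaker that is never needed (assuming sortkeysbyvalue returns descending (count, genre) pairs):

    winner, dissent, runnerup = resolve_voting(['fic', 'fic', 'non'], 'non')
    # winner -> 'fic', dissent -> 0.33..., runnerup -> 'non'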
Example #24
def resolve_voting(votes, tiebreaker):
    electorate = len(votes)

    results = dict()
    for vote in votes:
        # if vote == "bio":
        #   vote = "non"
        utils.addtodict(vote, 1, results)
    candidate = utils.sortkeysbyvalue(results, whethertoreverse=True)

    dissent = (electorate - candidate[0][0]) / electorate

    if len(candidate) < 2:
        # There is only one candidate.
        return candidate[0][1], dissent, candidate[0][1]

    elif candidate[0][0] > candidate[1][0]:
        # We have a majority.
        return candidate[0][1], dissent, candidate[1][1]

    else:
        # We have a tie.
        if tiebreaker == candidate[0][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[0][1], dissent, candidate[1][1]
        elif tiebreaker == candidate[1][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[1][1], dissent, candidate[0][1]
        else:
            print("Tie in spite of " + tiebreaker)
            win = random.choice([candidate[0][1], candidate[1][1]])
            if win == candidate[0][1]:
                runnerup = candidate[1][1]
            else:
                runnerup = candidate[0][1]

            return win, dissent, runnerup
Example #25
def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir,
                              n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        counts[docid] = Counter()
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                #
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                doc_freq[word] += 1
                counts[docid][word] += ct

                # # experimental
                # if word.startswith('#'):
                #     squaredfeature = word + 'sqrd'
                #     counts[docid][word] = ct * ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
Example #26
def get_genrevector(volumeIDs, boundarydef):
    global epindices, nonindices

    n = len(volumeIDs)

    genrevector = np.zeros(n)

    if boundarydef == "nonepistolary / epistolary":

        for idx, volID in enumerate(volumeIDs):
            cleanID = utils.pairtreelabel(volID)

            if cleanID in epindices:
                genrevector[idx] = 1
            elif cleanID in nonindices:
                genrevector[idx] = 0
            else:
                print("Error, missing in metadata: " + cleanID)

    return genrevector
Example #27
def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        counts[docid] = Counter()
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                #
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                doc_freq[word] += 1
                counts[docid][word] += ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
Example #28
def get_metadata_evidence(htid, rowindices, columns, metadata):
	'''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''

	metadata_evidence = dict()

	metadata_evidence["drama"] = False
	metadata_evidence["poetry"] = False
	metadata_evidence["biography"] = False
	metadata_evidence["fiction"] = False

	htid = utils.pairtreelabel(htid)
	# convert the clean pairtree filename into a dirty pairtree label for metadata matching

	if htid not in rowindices:
		# We have no metadata for this volume.
		return metadata_evidence

	else:
		genrestring = metadata["genres"][htid]
		genreinfo = genrestring.split(";")
		# It's a semicolon-delimited list of items.

		for info in genreinfo:

			if info == "Biography" or info == "Autobiography":
				metadata_evidence["biography"] = True

			if info == "Fiction" or info == "Novel":
				metadata_evidence["fiction"] = True

			if (info == "Poetry" or info == "Poems"):
				metadata_evidence["poetry"] = True

			if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
				metadata_evidence["drama"] = True

	return metadata_evidence
Example #29
def get_metadata_evidence(htid, rowindices, columns, metadata):
	'''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''

	metadata_evidence = dict()

	metadata_evidence["drama"] = False
	metadata_evidence["poetry"] = False
	metadata_evidence["biography"] = False
	metadata_evidence["fiction"] = False

	htid = utils.pairtreelabel(htid)
	# convert the clean pairtree filename into a dirty pairtree label for metadata matching

	if htid not in rowindices:
		# We have no metadata for this volume.
		return metadata_evidence

	else:
		genrestring = metadata["genres"][htid]
		genreinfo = genrestring.split(";")
		# It's a semicolon-delimited list of items.

		for info in genreinfo:

			if info == "Biography" or info == "Autobiography":
				metadata_evidence["biography"] = True

			if info == "Fiction" or info == "Novel":
				metadata_evidence["fiction"] = True

			if (info == "Poetry" or info == "Poems"):
				metadata_evidence["poetry"] = True

			if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
				metadata_evidence["drama"] = True

	return metadata_evidence
Example #30
def get_vocabulary(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:

                word = row['feature']
                if word.startswith('#header'):
                    word = word.replace('#header', '')

                doc_freq[word] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab
Example #31
def get_vocabulary(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:

                word = row['feature']
                if word.startswith('#header'):
                    word = word.replace('#header', '')

                doc_freq[word] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab
Example #32
            wordcountsbyfile[htid].append(count)
        else:
            wordcountsbyfile[htid] = [count]

    return wordcountsbyfile


# Begin main script.

TOL = 0.1
THRESH = 0.80

genrestocheck = ['fic', 'poe', 'dra']

metadatapath = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metadatapath)

firstsource = "/Users/tunder/Dropbox/pagedata/to1923features/genremaps/"
secondsource = "/Users/tunder/Dropbox/pagedata/seventhfeatures/genremaps/"

firstmaps = os.listdir(firstsource)
secondmaps = os.listdir(secondsource)

firstwordcounts = loadwordcounts(firstsource)
secondwordcounts = loadwordcounts(secondsource)

predictsource = '/Users/tunder/Dropbox/pagedata/production/crosspredicts/'

predicts = os.listdir(predictsource)
predicts = [x for x in predicts if not x.startswith('.')]
Example #33
for filename in files2read:
    print(filename)
    filepath = os.path.join(root, filename)
    with open(filepath, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        cols = reader.fieldnames
        for row in reader:
            if row['language'] != 'eng':
                continue

            if row['startdate'] is None:
                errors += 1
                continue

            inferreddate = utils.date_row(row)
            if inferreddate < 1923 or inferreddate > 2017:
                continue

            genres = set(row['genres'].lower().split('|'))
            if 'fiction' not in genres and 'novel' not in genres and 'short stories' not in genres:
                continue

            docid = row['docid']
            if docid in icdocs:
                alreadyhad += 1
                continue
            else:
                row['inferreddate'] = inferreddate
                rowlist.append(row)
Example #34
def main(sourcedir, metapath, modeldir, outpath, pairtree = False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.
    '''

    global allnames, top1000words

    # We're going to store all the models, by name, in a dictionary:

    models = dict()

    for name in allnames:
        models[name] = loadamodel(modeldir + name)

    # Now get metadata.

    metadata = get_metadata(metapath)

    predictedgenres = []
    predictedprobs = []
    explanations = []
    wordcounts = []
    englishpcts = []

    c = 0
    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            path = get_pairtree(sourcedir, docid)
            counts, error, wordcount = counts4json(path, docid)
        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            genredict = make_genredict(metadata, docid)
            englishpct = get_english_percent(counts, top1000words)
            genre, probability, explanation = volume_classification(models, counts, genredict)
        else:
            englishpct = 0
            genre = 'NA'
            probability = 0
            explanation = error

        predictedgenres.append(genre)
        predictedprobs.append(probability)
        explanations.append(explanation)
        wordcounts.append(wordcount)
        englishpcts.append(englishpct)

    metadata.loc[ : , 'predictedgenre'] = pd.Series(predictedgenres, index = metadata.index)
    metadata.loc[ : , 'probability'] = pd.Series(predictedprobs, index = metadata.index)
    metadata.loc[ : , 'wordcount'] = pd.Series(wordcounts, index = metadata.index)
    metadata.loc[ : , 'englishpct'] = pd.Series(englishpcts, index = metadata.index)
    metadata.loc[ : , 'explanation'] = pd.Series(explanations, index = metadata.index)

    metadata.to_csv(outpath)
Example #35
    females = text.split('<arr name="htrc_genderFemale">')
    if len(females) > 1:
        name = females[1].split("</str>")[0]
        name = name.replace("<str>", "")
        names.append((name, "f"))

    return (names)


## We start by loading the list of volumes for which we need a
## Library of Congress Call Number.

import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

neededoclcs = list()
reversemap = dict()

for idx in rowindices:
    if metadata["LOCnum"][idx] == "<blank>" and metadata["OCLC"][idx] != "<blank>":
        oclc = metadata["OCLC"][idx]
        neededoclcs.append(oclc)
        reversemap[oclc] = idx

counter = 0
metacounter = 0
lccndict = dict()
responsedict = dict()
def maxkey(dictionary):
    tuplelist = utils.sortkeysbyvalue(dictionary, whethertoreverse = True)
    winner = tuplelist[0][1]
    # if winner == "bio":
    #   winner = "non"
    return winner
Example #37
def maxkey(dictionary):
    tuplelist = utils.sortkeysbyvalue(dictionary, whethertoreverse=True)
    winner = tuplelist[0][1]
    # if winner == "bio":
    # 	winner = "non"
    return winner
# Generate Cotraining Set

# This script uses a set of volumes already classified and sorted by a model
# in order to generate additional training data for a new model.

import SonicScrewdriver as utils
from shutil import copyfile

indices, columns, metadata = utils.readtsv("/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")

toget = indices[-200:]

toget = [utils.pairtreefile(x) for x in toget]

genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"

for htid in toget:

	featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv"
	featuredestination = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" + htid + ".pg.tsv"
	copyfile(featuresource, featuredestination)

	genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
	genredestination = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" + htid + ".map"
	with open(genresource, mode="r", encoding = "utf-8") as f:
		filelines = f.readlines()

	with open(genredestination, mode="w", encoding = "utf-8") as f:
		for line in filelines:
			line = line.rstrip()
list_of_dataframes = []
idset = set()

list_of_files = args[1:]
root = '../rawdata/'
list_of_paths = [root + x for x in list_of_files]

for p in list_of_paths:
    df = pd.read_csv(p, index_col='docid')
    list_of_dataframes.append(df)
    idset = idset | set(df.index)

ids = []
for anid in idset:
    ids.append(utils.clean_pairtree(str(anid)))

allpaths = set()
with open('/Volumes/TARDIS/work/ef/htrc-ef-all-files.txt',
          encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        allpaths.add(line)

missing = set()
found = set()
mapping = dict()
path2id = dict()

#things we already have:
Example #40
def get_pairtree(pairtreeroot, htid):

    path, postfix = utils.pairtreepath(htid, pairtreeroot)
    wholepath = path + postfix + '/' + postfix + '.json.bz2'

    return wholepath
Example #41
# refine fiction

import SonicScrewdriver as utils

def passfilter(genrestring):
	fields = genrestring.split(';')
	if "Autobiography" in fields or "Biography" in fields:
		return False
	else:
		return True

rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')

rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
	filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()

missing = 0

for anid in idlist:
	if anid in rows19c:
		genrestring = table19c["genres"][anid]
		rowdict = dict()
		for col in columns19c:
			rowdict[col] = table19c[col][anid]
	elif anid in rows20c:
# a newer metadata set.

import csv
import SonicScrewdriver as utils
import random

selecteddates = dict()
selected = list()

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1840-1859_200.csv'
with open(reviews, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)

    for row in reader:

        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.append(htid)
Example #43
# sort_anovaset.py

import SonicScrewdriver as utils
import csv

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/19cmetadata.tsv')

with open('anovaset.txt', encoding = 'utf-8') as f:
    filelines = f.readlines()
    wholeset = [x.rstrip() for x in filelines]

the19c = list()
the20c = list()

for anid in wholeset:
    if anid in rows:
        the19c.append(anid)
    else:
        the20c.append(anid)

with open('anova19c.txt', mode = 'w', encoding = 'utf-8') as f:
    for anid in the19c:
        f.write(anid + '\n')

with open('anova20c.txt', mode = 'w', encoding = 'utf-8') as f:
    for anid in the20c:
        f.write(anid + '\n')



Example #44
def censor(htid, genresequence):

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    reported = dict()
    for symptom in symptoms:
        reported[symptom] = 0

    couldbefiction = True

    # Now we need to assess the largest genre in this volume.
    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0

    for page in genresequence:
        indexas = page

        # For this purpose, we treat biography and indexes as equivalent to nonfiction.
        if page == "bio" or page == "index" or page == "back":
            indexas = "non"

        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if htid not in rowindices and htid not in modelindices:
        return genresequence, reported

    if htid in rowindices:

        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:

            if info == "Biography" or info == "Autobiography":
                couldbefiction = False

            if info == "biog?" and maxgenre == "non":
                reported["weakconfirmation"] = 1
            if info == "biog?" and maxgenre != "non":
                reported["weakdenial"] = 1

            if info == "Not fiction" and maxgenre == "non":
                reported["weakconfirmation"] = 1
            if info == "Not fiction" and maxgenre == "fic":
                reported["weakdenial"] = 1

            if (info == "Fiction" or info == "Novel") and maxgenre == "fic":
                reported["strongconfirmation"] = 1
            if (info == "Fiction" or info == "Novel") and maxgenre != "fic":
                reported["strongdenial"] = 1

            if info == "Biography" and maxgenre == "non":
                reported["strongconfirmation"] = 1
            if info == "Biography" and maxgenre != "non":
                reported["strongdenial"] = 1

            if info == "Autobiography" and maxgenre == "non":
                reported["strongconfirmation"] = 1
            if info == "Autobiography" and maxgenre != "non":
                reported["strongdenial"] = 1

            if (info == "Poetry" or info == "Poems") and maxgenre == "poe":
                reported["strongconfirmation"] = 1
            if (info == "Poetry" or info == "Poems") and maxgenre != "poe":
                reported["strongdenial"] = 1

            if (info == "Drama" or info == "Tragedies"
                    or info == "Comedies") and maxgenre == "dra":
                reported["strongconfirmation"] = 1
            if (info == "Drama" or info == "Tragedies"
                    or info == "Comedies") and maxgenre != "dra":
                reported["strongdenial"] = 1

            if (info == "Catalog" or info == "Dictionary"
                    or info == "Bibliographies") and maxgenre == "non":
                reported["strongconfirmation"] = 1
                couldbefiction = False
            if (info == "Catalog" or info == "Dictionary"
                    or info == "Bibliographies") and maxgenre != "non":
                reported["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:

        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if not genre in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])
        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        modelprediction = predictionlist[0][1]
        modelconfidence = predictionlist[0][0]
        nextclosest = predictionlist[1][0]
        # Take the top prediction.

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1  ## modelconfidence - nextclosest
            reported["modeldisagrees"] = 0
        if maxgenre != modelprediction:
            ## divergence = modelconfidence - modelpredictions[maxgenre]
            reported["modeldisagrees"] = 1
            reported["modelagrees"] = 0
            ## print(maxgenre + " ≠ " + modelprediction)
    else:
        reported["modelagrees"] = 0
        reported["modeldisagrees"] = 0
        modelprediction = "unknown"

    if not couldbefiction:

        numberofpages = len(genresequence)
        for i in range(numberofpages):
            if genresequence[i] == "fic":
                genresequence[i] = "non"

    return genresequence, reported
Example #45
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

modelindices, modelcolumns, modeldata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")

options = ["non", "bio", "poe", "dra", "fic"]


def censor(htid, genresequence):

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    reported = dict()
    for symptom in symptoms:
        reported[symptom] = 0

    couldbefiction = True

    # Now we need to assess the largest genre in this volume.
# CreateStupidPredictions.py

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

sourcedirectory = "/Users/tunder/Dropbox/pagedata/mixedtraining/genremaps/"

dirlist = os.listdir(sourcedirectory)

validnames = list()

for filename in dirlist:
    if not (filename.startswith(".") or filename.startswith("_")):
        validnames.append(filename)

for filename in validnames:
    filepath = os.path.join(sourcedirectory, filename)

    with open(filepath, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    numpages = len(filelines)

    htid = utils.pairtreelabel(filename[0:-4])
    # convert the htid into a dirty pairtree label for metadata matching

    genre = "unknown"
# plotter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils
import pandas as pd
from scipy.stats.stats import pearsonr

indices, columns, agreement = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv")

indices2, columns2, confidence = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv")

for idx in indices:
	if idx not in indices2:
		print(idx + " is missing.")

makeframe = dict()

makeframe["human-agreement"] = agreement["agreement"]
makeframe["machine-accuracy"] = confidence["accuracy"]

df = pd.DataFrame(makeframe, dtype="float")
df = df.dropna()

print(str(pearsonr(df["human-agreement"], df["machine-accuracy"])))

plt.plot(df["human-agreement"], df["machine-accuracy"], "r.")
plt.xlabel("Human agreement")
plt.ylabel("Machine accuracy")
plt.axis([0,1.02,0,1.02])
plt.show()
Example #48
    reader = csv.reader(f)
    for fields in reader:
        idcode = fields[0]
        date = int(fields[8])
        datedict[idcode] = date
        dateset.add(date)

verbose = True

targetwords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign', 'sovereigns', 'pound', 'pounds', 'quid'}

contexts = []

for filename in filelist:

    htid = utils.pairtreelabel(filename.replace('.txt', ''))

    if htid not in datedict:
        print(htid)
        continue
    else:
        date = datedict[htid]

    filepath = os.path.join(sourcedir, filename)
    with open(filepath, encoding = 'utf-8') as f:
        filelines = f.readlines()
    pagelist = [filelines]

    # The wordcounter module expects a list of pages, each of which is a list of lines.
    # Ebooks have no pages -- at least as I currently receive them -- so we treat it
    # all as one giant page.
Example #49
def main(sourcedir, metapath, modeldir, outpath, pairtree = False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.
    '''

    global allnames, top1000words

    alternatesource = '/projects/ichass/usesofscale/post23/englishmonographs1980-2016/'

    # We're going to store all the models, by name, in a dictionary:

    models = dict()

    modelpaths = glob.glob(modeldir + '*.p')

    for apath in modelpaths:
        name = apath.replace(modeldir, '')
        name = name.replace('.p', '')
        models[name] = loadamodel(apath)

    # Now get metadata.

    metadata = get_metadata(metapath)

    nonficprobs = []
    juvieprobs = []
    wordcounts = []

    c = 0
    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            path1 = get_pairtree(sourcedir, docid)
            path2 = get_pairtree(alternatesource, docid)

            if os.path.isfile(path1):
                chosenpath = path1
            elif os.path.isfile(path2):
                chosenpath = path2
            else:
                print(path1)
                print(path2)
                print('file not found')
                chosenpath = None
                error = 'file not found'
                wordcount = 0

            if chosenpath is not None:
                counts, error, wordcount = counts4json(chosenpath, docid)

        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            nonficprob, juvenileprob = volume_classification(models, counts)
        else:
            nonficprob = 0.5
            juvenileprob = 0.5

        nonficprobs.append(nonficprob)
        juvieprobs.append(juvenileprob)
        wordcounts.append(wordcount)


    metadata.loc[ : , 'nonficprob'] = pd.Series(nonficprobs, index = metadata.index)
    metadata.loc[ : , 'juvenileprob'] = pd.Series(juvieprobs, index = metadata.index)
    metadata.loc[ : , 'wordcount'] = pd.Series(wordcounts, index = metadata.index)

    metadata.to_csv(outpath)
Example #50
import csv
from collections import Counter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils

targetfile = input('Path to input file? ')

counts = dict()
alltags = set()
alldecades = set()
allcounts = Counter()

with open(targetfile, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = row['date']
        decade = 10 * int(int(date)/10)
        tagset = utils.get_tagset(row['genretags'])
        for tag in tagset:
            if tag == 'chirandom' and ('chiscifi' in tagset):
                continue
            if tag not in counts:
                counts[tag] = Counter()

            counts[tag][decade] += 1
            alltags.add(tag)
            alldecades.add(decade)
            allcounts[decade] += 1

sorted_decades = sorted(list(alldecades))
numdecs = len(sorted_decades)

colors = ['g-', 'b-', 'r-', 'k-', 'ro', 'go', 'bo', 'ko']
Example #51
import csv
from collections import Counter
import SonicScrewdriver as utils

ficids = set()

meta = dict()

ficsource = "/Volumes/TARDIS/work/fiction/metadata/fiction_metadata.csv"
with open(ficsource, encoding="utf-8") as f:
    reader = csv.DictReader(f)
    fieldnames = reader.fieldnames
    for row in reader:
        htid = row["htid"]
        dirtyhtid = utils.dirty_pairtree(htid)
        ficids.add(dirtyhtid)
        meta[dirtyhtid] = row

metasource = "/Volumes/TARDIS/work/metadata/MergedMonographs.tsv"

mysterysubjects = Counter()
scifisubjects = Counter()
gothsubjects = Counter()
gothclues = ["ghost stories", "gothic revival", "horror"]
genretags = dict()


def add_tag(genretags, htid, tagtoadd):
    if htid not in genretags:
        genretags[htid] = set()
modelfolder = "/Volumes/TARDIS/work/moneycontext/"
modelpath = modelfolder + "logisticmodel.p"
with open(modelpath, mode = 'rb') as f:
    logisticmodel = pickle.load(f)

standardizerpath = modelfolder + 'standardizer.p'
with open(standardizerpath, mode = 'rb') as f:
    standardizer = pickle.load(f)

featurepath = modelfolder + 'featurelist.p'
with open(featurepath, mode = 'rb') as f:
    features = pickle.load(f)

# Now load HathiTrust metadata.

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')

ambiguouswords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign', 'sovereigns', 'pound', 'pounds', 'quid'}

moneywords = {'dollar', 'dollars', 'dime', 'dimes', 'nickel', 'nickels', 'pound', 'pounds',
              'shilling', 'shillings', 'sovereign', 'sovereigns', 'cent', 'cents',
              'centime', 'centimes', 'crown', 'crowns', 'halfcrown', 'half-crown',
              'penny', 'pennies', 'pence', 'farthing', 'farthings', 'franc', 'francs',
              'guilder', 'guilders', 'florin', 'florins', 'guinea', 'guineas',
              "ha'penny", 'tuppence', 'twopence', 'sixpence', '|arabicprice|', '|price|', 'quid'}

# Words I explicitly decided not to include: 'quarter', 'quarters', 'mark', 'marks.' Monetary uses
# seemed rare enough relative to others that they'd be more likely to introduce noise than to help.
# |arabicprice| is a code the tokenizer in modelingcounter produces whenever it encounters
# a number connected to £, $, ¢, s, or d. In the output we convert that to |price|, for no very
# good reason.

wealthwords = {'fortune', 'fortunes', 'wealth', 'rich', 'riches', 'money', 'moneys', 'fund', 'funds', 'sum', 'sums', 'price', 'prices', 'priced'}

# This is by no means an exhaustive list. Owe, loan, borrowed, etc.
# If we really want to get at the full range of words potentially
Example #53
# print(roughaccuracy)
print("SMOOTHED MICROACCURACY:")
print(smoothaccuracy)
print("COALESCED MICROACCURACY:")
print(coalaccuracy)

with open("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv",
          mode="w",
          encoding="utf-8") as f:
    f.write("htid\taccuracy\n")
    for key, value in accuracies.items():
        outline = key + "\t" + str(value) + "\n"
        f.write(outline)

metadatapath = os.path.join(firstdir, "predictionMetadata.tsv")
rowindices, columns, metadata = utils.readtsv(metadatapath)

metadatatable['maxprob'] = metadata['maxprob']
metadatatable['gap'] = metadata['gap']
metadatatable['accuracy'] = accuracies
metadatatable['dissent'] = dissentperfile

data = pd.DataFrame(metadatatable, dtype="float")

data['intercept'] = 1.0
train_cols = data.columns[1:]
logit = sm.Logit(data['accuracy'], data[train_cols])
result = logit.fit()
print(result.summary())
predictions = result.predict(data[train_cols])
print(pearsonr(data['accuracy'], predictions))
Example #54
    metasource = pd.read_csv(args[1], sep='\t')

    missing = 0

    docstoprocess = metasource.docid

    for idx, docid in enumerate(docstoprocess):

        if idx % 100 == 1:
            print(idx)

        if docid in translations:
            docid = translations[docid]

        path, postfix = utils.pairtreepath(docid, '')
        inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
            docid) + '.json.bz2'

        if os.path.isfile(inpath):
            pass
        elif 'uc1.b' in docid:
            newdoc = docid.replace('uc1.b', 'uc1.$b')
            path, postfix = utils.pairtreepath(newdoc, '')
            inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
                newdoc) + '.json.bz2'
            if os.path.isfile(inpath):
                translations[docid] = newdoc
            else:
                missing += 1
                print(missing, inpath, 'not found.')
Example #55
                if thisreader not in readerowners[f]:
                    readerowners[f].append(thisreader)
                    paths[f].append(thispath)

print(len(tagset))

allfiles = tagset
# This is a list of all the filenames (note, filenames not docids)
# that we found in the /readers sourcedir.

train1 = pd.read_csv('../bzipmeta.csv', dtype = 'object', index_col = 'docid')

tidx = set(train1.index.values)
for filename in allfiles:
    docid = filename.replace('.csv', '')
    if utils.dirty_pairtree(docid) not in tidx:
        print(docid)

genrestocheck = ['fic', 'poe']
equivalences = {'non', 'bio', 'other'}

volumesingenre = dict()
for g in genrestocheck:
    volumesingenre[g] = []

alldocids = set()

for filename, owners in readerowners.items():
    path = paths[filename][0]
    if 'metadat' in filename:
        print(filename)
Example #56
# getidstoadd

import SonicScrewdriver as utils
import os

with open('/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv', encoding = 'utf-8') as f:
    filelines = f.readlines()

ids2get = [x.split('\t')[0] for x in filelines]

fileswehave = os.listdir('/Users/tunder/Dropbox/GenreProject/python/granger/elite/')
idswehave = set([x.replace('.poe.tsv','') for x in fileswehave if x.endswith('.poe.tsv')])

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv', mode = 'w', encoding = 'utf-8') as f:
    for anid in ids2get:
        if anid not in idswehave and utils.clean_pairtree(anid) not in idswehave:
            f.write(utils.dirty_pairtree(anid) + '\n')