Пример #1
0
def divide_into_quotations(booklist):

    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')
    longreviewnames = set()
    for rev in reviewdict.keys():
        reviewparts = rev.split()
        if len(reviewparts) < 1:
            continue
        elif len(reviewparts[0]) > 4:
            longreviewnames.add(reviewparts[0])

    publishers = [
        'Liverlght', 'Appleton', 'Baker', 'Barnes', 'Benziger', 'Bobbs',
        "Brentano's", 'Cassell', 'Century', 'Collier-Fox', 'Crowell', 'Ditson',
        'Dodd', 'Doran', 'Doubleday', 'Dutton', 'Elder', 'Estes', 'Ginn',
        'Goodspeed', 'Harper', 'Heath', 'Holt', 'Houghton', 'Knopf', 'Lane',
        'Lippincott', 'Little', 'Liveright', 'Longmans', 'Macmillan',
        'McBride', 'McClure', 'McGraw', 'Moffat', 'Oxford', 'Page', 'Pott',
        'Putnam', 'Scribner', 'Simmons', 'Stokes', 'Walton', 'Warne',
        'Wessels', 'Wilde', 'Wiley', 'Winston', 'Yale'
    ]

    lexical_patterns = [('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'), \
    ('reviewword', all_reviewwords),
    ('openparen', '\(.*'),
    ('closeparen', '.*\)'),
    ('fullstop', '.*\.'),
    ('commastop', '.*\,'),
    ('startdash', '—.*'),
    ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
    ('titlecase', '[A-Z].*'),
    ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
    ('lineendingyear', '[\'"•■]\d+'),
    ('volandpgrange', '[0-9]+[:][0-9-]+'),
    ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
    ('allcaps', '[A-Z\'\,]+'),
    ('dollarprice', '.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
    ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
    ('openquote', '[\"\'“‘]+\S*'),
    ('plusorminus', '[\+\-\—]+'),
    ('reviewword', all_reviewwords),
    ('wordcount', '\d*0w[.]?'),
    ('OCRwordcount', '\S*Ow[.]?')
    ]

    wordcountregex = re.compile('\d*0w[.]?')
    ocrwordcountregex = re.compile('\S*Ow[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    for book in booklist:
        lines = book.reviewlines

        accumulated = []
        citationcount = 0

        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):

            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            if len(addtonext) > 0:
                line = addtonext + ' ' + line
                addtonext = ''

            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # in the first two lines we often have fragments
            # left over from the book bibliographical entry

            if linecount <= 3:
                trailingbibline = False

                for tags in taglist.tagseq:
                    if 'hyphennumber' in tags or 'dollarprice' in tags or 'centprice' in tags:
                        trailingbibline = True

                if trailingbibline:

                    # get the existing publisher to see if it makes more sense
                    # fused with something in this trailing line

                    existingpubparts = book.publisher.split()
                    if len(existingpubparts) > 0:
                        existingpub = existingpubparts[-1].strip('-')
                    else:
                        existingpub = 'not a publisher'

                    tokenssofar = []
                    for l in accumulated:
                        tokenssofar.extend(l.strip().split())
                    tokenssofar.extend(tokens)

                    tokenssofar = [x.strip('.,[]()-') for x in tokenssofar]

                    for tok in tokenssofar:
                        if tok in publishers:
                            book.publisher = tok

                        rejoined = existingpub + tok
                        if rejoined in publishers:
                            book.publisher = book.publisher.strip('-') + tok

                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.

            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    sentiment = ''
                    review = 'summary'
                    cite = 'summary'
                    citationcount += 1
                    quote = Quotation(book, review, sentiment, cite,
                                      accumulated)
                    allquotes.append(quote)
                    accumulated = []
                    accumulated.append(line)
                    # this line (opening with a quote) will be part of the next quotation
                    matched = True
                    continue

            numberwords = 0
            reviewwords = 0
            plusyet = False
            totalclues = 0

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags:
                    reviewwords += 1
                    totalclues += 1

                elif 'plusorminus' in tags and not plusyet:
                    reviewwords += 0.5
                    totalclues += 1
                    plusyet = True

                elif 'monthabbrev' in tags:
                    totalclues += 1

                elif 'somenumeric' in tags and not '-' in word and not ',' in word:
                    numberwords += 1
                    totalclues += 1
                    if word.endswith('w'):
                        totalclues += 1
                        reviewwords += 0.5
                    elif ':' in word:
                        totalclues += 1
                        reviewwords += 0.5
                    elif word.startswith('p'):
                        totalclues += 1
                        reviewwords += 0.5

            # fuzzy match in situations where everything is there except the review
            # because it could easily be ocr error

            if numberwords > 0 and totalclues > 2 and reviewwords < 0.9:
                firstword = taglist.stringseq[0]
                if len(firstword) > 3:
                    for longname in longreviewnames:
                        similarity = match_strings(firstword, longname)
                        if similarity > .7:
                            reviewwords += 1
                            totalclues += 1
                            break

            if numberwords > 0 and reviewwords > 0.9 and totalclues > 3:
                sentimentbits = []

                numericyet = False
                publisherbits = []
                citationbits = []

                nextwordctr = 0

                for word, tags in zip(taglist.stringseq, taglist.tagseq):

                    nextwordctr += 1

                    if not numericyet and word == '+':
                        sentimentbits.append('+')
                        continue
                    if not numericyet and word in plusmisreads:
                        # e.g. '4-' is a fairly common ocr misread for +
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '-' or word == '—'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '=-' or word == '--'
                                           or word == '-—'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '==' or word == '=--'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '+-' or word == '+—'
                                           or word == '+='):
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '-+' or word == "—+"
                                           or word == '=+'):
                        sentimentbits.append('-')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '++-' or word == '++—'
                                           or word == "++="):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                        sentimentbits.append('-')
                    if not numericyet and (word == '++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '+++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and nextwordctr == 1 and word == "H":
                        # this is a weird but common misread; however, it's risky
                        # enough that we should only do it in first position
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue

                    if 'somenumeric' in tags:
                        numericyet = True

                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)

                    if numericyet and ('wordcount' in tags or 'OCRwordcount'
                                       in tags) and (nextwordctr < len(
                                           taglist.stringseq)):
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break

                # if this line doesn't end with a word count, and the next one does?
                # probably a continuation

                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]) and not ocrwordcountregex.fullmatch(
                            citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline) > 0 and len(
                                wordsinnextline
                        ) < 3 and wordcountregex.fullmatch(
                                wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True

                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)
                citationcount += 1

                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []

            else:
                # odds of review 1 or less
                accumulated.append(line)

    return allquotes
Пример #2
0
def divide_into_quotations(booklist):

    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')
    reviewnames = set(reviewdict.keys())

    lexical_patterns = [('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'), \
    ('reviewword', all_reviewwords),
    ('openparen', '\(.*'),
    ('closeparen', '.*\)'),
    ('fullstop', '.*\.'),
    ('commastop', '.*\,'),
    ('startdash', '—.*'),
    ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
    ('titlecase', '[A-Z].*'),
    ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
    ('lineendingyear', '[\'"•■]\d+'),
    ('volandpgrange', '[0-9]+[:][0-9-]+'),
    ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
    ('allcaps', '[A-Z\'\,]+'),
    ('dollarprice', '.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
    ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
    ('openquote', '[\"\'“‘]+\S*'),
    ('plusorminus', '[\+\-\—]+'),
    ('reviewword', all_reviewwords),
    ('wordcount', '\d*0w[.]?')
    ]

    wordcountregex = re.compile('\d*0w[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    for book in booklist:
        lines = book.reviewlines

        accumulated = []
        citationcount = 0

        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):

            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            if len(addtonext) > 0:
                line = addtonext + ' ' + line
                addtonext = ''

            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # in the first two lines we often have fragments
            # left over from the book bibliographical entry

            if linecount <= 3:
                trailingbibline = False

                for tags in taglist.tagseq:
                    if 'hyphennumber' in tags or 'dollarprice' in tags or 'centprice' in tags:
                        trailingbibline = True
                if trailingbibline:
                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.

            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    sentiment = ''
                    review = 'summary'
                    cite = 'summary'
                    citationcount += 1
                    quote = Quotation(book, review, sentiment, cite,
                                      accumulated)
                    allquotes.append(quote)
                    accumulated = []
                    accumulated.append(line)
                    # this line (opening with a quote) will be part of the next quotation
                    matched = True
                    continue

            oddsofreview = 0
            reviewwordyet = False

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags and not reviewwordyet:
                    oddsofreview += 1
                    reviewwordyet = True
                if 'plusorminus' in tags:
                    oddsofreview += 1
                if 'somenumeric' in tags and not '-' in word and not ',' in word:
                    oddsofreview += 1

            if (oddsofreview > 1 and linecount > 1) or oddsofreview > 2:
                sentimentbits = []

                numericyet = False
                publisherbits = []
                citationbits = []

                nextwordctr = 0

                for word, tags in zip(taglist.stringseq, taglist.tagseq):

                    nextwordctr += 1

                    if not numericyet and word == '+':
                        sentimentbits.append('+')
                        continue
                    if not numericyet and word in plusmisreads:
                        # e.g. '4-' is a fairly common ocr misread for +
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '-' or word == '—'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '=-' or word == '--'
                                           or word == '-—'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '==' or word == '=--'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '+-' or word == '+—'
                                           or word == '+='):
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '-+' or word == "—+"
                                           or word == '=+'):
                        sentimentbits.append('-')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '++-' or word == '++—'
                                           or word == "++="):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                        sentimentbits.append('-')
                    if not numericyet and (word == '++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '+++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and nextwordctr == 1 and word == "H":
                        # this is a weird but common misread; however, it's risky
                        # enough that we should only do it in first position
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue

                    if 'somenumeric' in tags:
                        numericyet = True

                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)

                    if numericyet and 'wordcount' in tags and (
                            nextwordctr < len(taglist.stringseq)):
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break

                # if this line doesn't end with a word count, and the next one does?
                # probably a continuation

                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline
                               ) > 0 and wordcountregex.fullmatch(
                                   wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True

                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)
                citationcount += 1

                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []

            else:
                # odds of review 1 or less
                accumulated.append(line)

    return allquotes
Пример #3
0
# publisherfinder.py

import glob, sys, csv, re
from collections import Counter

anynumregex = re.compile('\S{0,3}\d\S{0,3}')

stopwords = {'the', 'from', 'of', 'and'}

args = sys.argv

year = args[1]

import read_pubnames

all_reviewwords, reviewdict = read_pubnames.get_names('brd_pubs_indexed1940s.tsv')

all_review_codes = set([x.replace('.', '') for x in reviewdict.keys()])
all_review_codes.add('summary')

publishers = []

with open('40spublishers.txt', encoding = 'utf-8') as f:
    for line in f:
        publishers.append(line.strip())

targetpaths = glob.glob('/media/secure_volume/brd/output/' +year + '*.tsv')

if len(targetpaths) > 0:
    targetpath = targetpaths[0]