Example no. 1
def get_citations(page, rule_list, start_headword, end_headword):
    ''' Tags all the strings on a page, divides the tagged page into
    citations, and parses each citation, using the initial letters of the
    start and end headwords as alphabetical bounds (defaulting to a and z).
    '''
    print()
    print('Next page.')
    tagged_page = lexparse.apply_rule_list(rule_list, page)
    citation_list = divide_into_citations(tagged_page)

    if len(start_headword) < 1:
        startinitial = 'a'
    else:
        startinitial = start_headword[0].lower()

    if len(end_headword) < 1:
        endinitial = 'z'
    else:
        endinitial = end_headword[0].lower()

    for cite in citation_list:
        parse_citation(cite, startinitial, endinitial)

    return citation_list
Example no. 2
def divide_into_quotations(booklist):

    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')
    reviewnames = set(reviewdict.keys())

    lexical_patterns = [('numeric', r'.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('reviewword', all_reviewwords),
    ('openparen', r'\(.*'),
    ('closeparen', r'.*\)'),
    ('fullstop', r'.*\.'),
    ('commastop', r'.*\,'),
    ('startdash', '—.*'),
    ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
    ('titlecase', '[A-Z].*'),
    ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
    ('lineendingyear', r'[\'"•■]\d+'),
    ('volandpgrange', '[0-9]+[:][0-9-]+'),
    ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
    ('allcaps', r'[A-Z\'\,]+'),
    ('dollarprice', r'.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
    ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
    ('openquote', r'[\"\'“‘]+\S*'),
    ('plusorminus', r'[\+\-\—]+'),
    ('reviewword', all_reviewwords),
    ('wordcount', r'\d*0w[.]?')
    ]

    wordcountregex = re.compile(r'\d*0w[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    for book in booklist:
        lines = book.reviewlines

        accumulated = []
        citationcount = 0

        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):

            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            if len(addtonext) > 0:
                line = addtonext + ' ' + line
                addtonext = ''

            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # in the first few lines we often have fragments
            # left over from the book bibliographical entry

            if linecount <= 3:
                trailingbibline = False

                for tags in taglist.tagseq:
                    if 'hyphennumber' in tags or 'dollarprice' in tags or 'centprice' in tags:
                        trailingbibline = True
                if trailingbibline:
                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is that
            # a) this is the first sequence of lines,
            # b) the next line opens with a quotation mark,
            # and c) no other citation info has been provided yet.

            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    sentiment = ''
                    review = 'summary'
                    cite = 'summary'
                    citationcount += 1
                    quote = Quotation(book, review, sentiment, cite,
                                      accumulated)
                    allquotes.append(quote)
                    accumulated = []
                    accumulated.append(line)
                    # this line (opening with a quote) will be part of the next quotation
                    continue

            oddsofreview = 0
            reviewwordyet = False

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags and not reviewwordyet:
                    oddsofreview += 1
                    reviewwordyet = True
                if 'plusorminus' in tags:
                    oddsofreview += 1
                if 'somenumeric' in tags and not '-' in word and not ',' in word:
                    oddsofreview += 1

            if (oddsofreview > 1 and linecount > 1) or oddsofreview > 2:
                sentimentbits = []

                numericyet = False
                publisherbits = []
                citationbits = []

                nextwordctr = 0

                for word, tags in zip(taglist.stringseq, taglist.tagseq):

                    nextwordctr += 1

                    if not numericyet and word == '+':
                        sentimentbits.append('+')
                        continue
                    if not numericyet and word in plusmisreads:
                        # e.g. '4-' is a fairly common ocr misread for +
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '-' or word == '—'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '=-' or word == '--'
                                           or word == '-—'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '==' or word == '=--'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '+-' or word == '+—'
                                           or word == '+='):
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '-+' or word == "—+"
                                           or word == '=+'):
                        sentimentbits.append('-')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '++-' or word == '++—'
                                           or word == "++="):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '+++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and nextwordctr == 1 and word == "H":
                        # this is a weird but common misread; however, it's risky
                        # enough that we should only do it in first position
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue

                    if 'somenumeric' in tags:
                        numericyet = True

                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)

                    if numericyet and 'wordcount' in tags and (
                            nextwordctr < len(taglist.stringseq)):
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break

                # if this line doesn't end with a word count, and the next one does?
                # probably a continuation

                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline
                               ) > 0 and wordcountregex.fullmatch(
                                   wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True

                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)
                citationcount += 1

                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []

            else:
                # odds of review 1 or less
                accumulated.append(line)

    return allquotes
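
The long chain of "if not numericyet" checks above normalizes plus/minus sentiment symbols that OCR has mangled before the numeric part of the citation begins. A minimal, self-contained sketch of the same idea; normalize_sentiment is an illustrative name, not part of the original module, and the position-dependent 'H' case above is left out.

# Minimal sketch of the sentiment-symbol normalization used above.
PLUS_MISREADS = {'-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-',
                 '—|—', '-I-', '-(-', '-f'}

def normalize_sentiment(word):
    '''Translate one leading token into a list of '+'/'-' marks,
    or return an empty list if it is not a sentiment symbol.'''
    if word == '+' or word in PLUS_MISREADS:
        return ['+']
    if word in {'-', '—', '—-', '=-', '--', '-—', '==', '=--'}:
        return ['-']
    if word in {'+-', '+—', '+='}:
        return ['+', '-']
    if word in {'-+', '—+', '=+'}:
        return ['-', '+']
    if word in {'++-', '++—', '++='}:
        return ['+', '+', '-']
    if word == '++':
        return ['+', '+']
    if word == '+++':
        return ['+', '+', '+']
    return []

# normalize_sentiment('4-')   -> ['+']   (common OCR misread of '+')
# normalize_sentiment('+—')   -> ['+', '-']
# normalize_sentiment('750w') -> []      (numeric token, not a sentiment mark)
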
Example no. 3
def get_books(pagelist, publishers):

    lexical_patterns = [('numeric', r'.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('genreword', {'reviewed', 'by', 'review.', 'review', 'by,',
        'article.', 'article', 'article-non', 'lit.', 'non-lit.',
        'poem.', 'fict.', 'fiction', 'fict', 'fiction.', 'poem'}),
    ('openparen', r'\(.*'),
    ('closeparen', r'.*\)'),
    ('fullstop', r'\S+[\.\?!]'),
    ('commastop', r'.*\,'),
    ('startdash', '—.*'),
    ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
    ('titlecase', '[A-Z].*'),
    ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
    ('lineendingyear', r'[\'"•■]\d+'),
    ('volandpgrange', '[0-9]+[:][0-9-]+'),
    ('somenumeric', '.?.?.?[0-9]{1,7}.?.?[0-9]*.?'),
    ('allcaps', r'[[A-Z\'\,\‘\.\-:;]*[A-Z]{2,}[A-Z\'\,\‘\.\:;]*'),
    ('dollarprice', r'.{0,2}[$\"\'\“].?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
    ('hyphennumber', '.?[0-9]{1,2}[-—~]+[0-9]{3,7}.?'),
    ('openquote', r'[\"\'“‘]+\S*'),
    ('deweydecimal', '[0-9]{3}[.][0-9-]+'),
    ('numpages', r'\d{2,5}p')
    ]

    rule_list = lexparse.patterns2rules(lexical_patterns)

    # Our strategy is to find pairs of lines that bookend a citation. A
    # citation and the lines that follow it (before the next citation)
    # are going to count as an Author.

    # A citation starts with a line whose first word is either capitalized and
    # followed by a comma, or all uppercase and longer than three characters.
    # It should be alphabetically later than the last author-name. Exceptions
    # are flagged.

    # A citation ends with a line (up to five lines later) that contains
    # $ + number or number + c, or number containing a hyphen, or that ends
    # with a publisher.

    # We create a list of Book objects as well as a list of author_errors,
    # recording places where author names fall out of alphabetical order.

    author_errors = []
    books = []

    # We keep track of the last author name we've seen; it's going to be
    # the name that governs the lines we are currently examining.

    last_author_name = ''

    textpage = 1

    reviewlines = []
    citation_started = False
    citation_finished = False

    citationlines = []
    governing_citation = Citation(['Anonymous. My book. $1.00. Macmillan.'],
                                  rule_list, textpage)
    aligned = 0

    for pagenum, page in enumerate(pagelist):

        for linenum, line in enumerate(page):

            line = line.strip()

            this_line_is_new_citation = False

            # Line numbers are only relevant in guiding us to ignore the running header,
            # and to update the page number. This will be imperfect, because of OCR errors,
            # but that's okay. If we get four successive pages that increment as they
            # should (10, 11, 12, 13, ...), we say that enough have been aligned to
            # override checking and just keep adding one per page.

            if aligned >= 4 and linenum == 0:
                textpage += 1

            if linenum < 4:
                thematch = match_strings('BOOK REVIEW DIGEST', line)
                if thematch > 0.8 and len(line) > 7:
                    wordsinline = line.split()
                    if len(wordsinline) > 3 and wordsinline[0].isdigit():
                        pagenum = int(wordsinline[0])
                    elif len(wordsinline) > 3 and wordsinline[-1].isdigit():
                        pagenum = int(wordsinline[-1])

                    if textpage + 1 == pagenum:
                        aligned += 1
                        textpage = pagenum
                    elif pagenum < 20:
                        textpage = pagenum

                    continue
                    # skip this line

                if len(line) > 15:
                    thetail = line[-11:]
                    thematch = match_strings('—Continued.', thetail)
                    if thematch > 0.75:
                        continue

                if line.isdigit() and aligned < 4:
                    try:
                        newtextpage = int(line)
                        if textpage + 1 == newtextpage:
                            aligned += 1
                        else:
                            aligned = 0
                        textpage = newtextpage
                    except:
                        textpage += 1

                elif len(line) > 12 and line[-2:].isdigit() and aligned < 4:
                    words = line.split()
                    if words[-1].isdigit():
                        pagenumpart = words[-1]
                        try:
                            newtextpage = int(pagenumpart)
                            if textpage + 1 == newtextpage:
                                aligned += 1
                            else:
                                aligned = 0
                            textpage = newtextpage
                        except:
                            textpage += 1

                elif len(line) > 12 and line[0:2].isdigit() and aligned < 4:
                    words = line.split()
                    if words[0].isdigit():
                        pagenumpart = words[0]
                        try:
                            newtextpage = int(pagenumpart)
                            if textpage + 1 == newtextpage:
                                aligned += 1
                            else:
                                aligned = 0
                            textpage = newtextpage
                        except:
                            textpage += 1

            if line.startswith('Figures in parenth'):
                continue

            tokens = line.split()
            if len(tokens) < 1:
                continue

            # There are things that look like the beginning of a citation but
            # are actually cross-references

            if "See" in line:
                skepticalofcitestart = True
            else:
                skepticalofcitestart = False

            nextline = linenum + 1

            if nextline < len(page) and "See" in page[nextline]:
                skepticalofcitestart = True

            cannotcitestart = False

            if skepticalofcitestart and linenum + 5 < len(page):
                for lookforward in range(1, 5):
                    futureline = page[linenum + lookforward]
                    if percent_upper(futureline) > .3:
                        cannotcitestart = True

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            firstword = taglist.stringseq[0]
            firsttagset = taglist.tagseq[0]

            cluescitationahead = 0

            distancetolook = 6

            if (len(page) - linenum) < distancetolook:
                distancetolook = len(page) - linenum

            if 'allcaps' in firsttagset and len(
                    firstword) > 2 and distancetolook > 1:
                for lookforward in range(1, distancetolook):
                    futureline = page[linenum + lookforward]
                    if '$' in futureline or "ed." in futureline:
                        cluescitationahead += 1
                    if cluescitationahead > 0:
                        futuretokens = futureline.split()
                        for t in futuretokens:
                            if hyphenregex.fullmatch(t):
                                cluescitationahead += 1

            if not citation_started and not cannotcitestart:

                allcapcount = 0
                for tags in taglist.tagseq:
                    if 'allcaps' in tags:
                        allcapcount += 1

                lineuppercasepct = percent_upper(line)

                if (allcapcount > 0 and lineuppercasepct > 0.1
                        and len(line) > 9) or (
                            lineuppercasepct > 0.6
                            and len(line) > 9) or (allcapcount > 0
                                                   and cluescitationahead > 0
                                                   and len(line) > 9):

                    percentageupper = percent_upper(firstword)
                    if len(line) > 15:
                        pctupin15 = percent_upper(line[0:15])
                    else:
                        pctupin15 = 0

                    if 'allcaps' in firsttagset and len(
                            firstword) > 2 and cluescitationahead > 0:
                        this_line_is_new_citation = True
                    elif lineuppercasepct > 0.72 and len(line) > 14:
                        this_line_is_new_citation = True
                    elif pctupin15 > .35 and ('$' in line or cluescitationahead
                                              > 1) and len(reviewlines) > 3:
                        this_line_is_new_citation = True
                    elif percentageupper > 0.7 and len(
                            firstword) > 4 and allcapcount > 2:
                        this_line_is_new_citation = True
                    elif pctupin15 > 0.65:
                        this_line_is_new_citation = True
                    else:
                        reviewlines.append(line)

                else:
                    reviewlines.append(line)

                if this_line_is_new_citation:
                    # a new citation has begun
                    citation_finished = False

                    citationlines = []

                    for string, tags in zip(taglist.stringseq, taglist.tagseq):
                        if 'dollarprice' in tags or 'centprice' in tags:
                            # note that our conditions for ending a citation with the first
                            # line are more stringent than they will be from the second onward
                            citation_finished = True
                            break

            elif not citation_started and cannotcitestart:
                reviewlines.append(line)

            else:
                # if a citation has been started, let's see if we should end it

                for string, tags in zip(taglist.stringseq, taglist.tagseq):
                    if 'dollarprice' in tags or 'centprice' in tags or 'hyphennumber' in tags or 'deweydecimal' in tags:
                        # more conditions can end a citation now
                        citation_finished = True
                        break

                if len(taglist.stringseq) > 1 and taglist.stringseq[-1].strip(
                        '.') in publishers:
                    # sometimes there's no price and the publisher's name is the only clue
                    # that the citation is finished
                    citation_finished = True

                if len(citationlines) > 2 and len(
                        taglist.tagseq
                ) > 1 and 'somenumeric' in taglist.tagseq[0]:
                    try:
                        deweydecimal = float(taglist.stringseq[0])
                        if deweydecimal > 99:
                            citation_finished = True
                    except:
                        pass

            if this_line_is_new_citation or citation_started:
                citationlines.append(line)
                citation_started = True
                this_line_is_new_citation = False

            if citation_finished:
                # we have concluded a new citation
                # first, make the last citation into a book:

                thisbook = Book(governing_citation, reviewlines)
                books.append(thisbook)

                # initialize reviewlines, and create a new citation
                reviewlines = []
                citation_started = False
                citation_finished = False
                # we finished that citation, started a new one

                governing_citation = Citation(citationlines, rule_list,
                                              textpage)
                citationlines = []

                new_author = governing_citation.author
                if new_author < last_author_name:
                    author_errors.append(
                        (textpage, last_author_name, new_author))
                last_author_name = new_author

            elif len(citationlines) > 8:
                # this is too many lines, and we were probably in error to have
                # started the citation, so put those lines back in reviewlines.
                # This is esp. likely to happen at the top of a page, when
                # an entry is "continued."

                reviewlines.extend(citationlines)
                citationlines = []
                citation_started = False

            elif len(citationlines) > 2:
                lineuppercasepct = percent_upper(line)
                lastuppercasepct = percent_upper(citationlines[-2])

                if lineuppercasepct > .45 and cluescitationahead > 0 and len(
                        line) > 12 and lastuppercasepct < .45:
                    # we started a citation in error two or more lines back; this is the actual
                    # citation start
                    # notice that we check the pct uppercase of last line to make sure this isn't
                    # just a long multiline author name!

                    # discarded = citationlines[0: -1]
                    # for d in discarded:
                    #     print(d)
                    citationlines = [citationlines[-1]]

    return books, author_errors
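
get_books relies on two helper functions defined elsewhere in the project, percent_upper and match_strings, plus a module-level hyphenregex, none of which appear in this excerpt. The sketch below shows what they are assumed to do, inferred only from how they are called above; the real implementations may differ.

import re
from difflib import SequenceMatcher

# Assumption: mirrors the 'hyphennumber' lexical pattern defined above.
hyphenregex = re.compile(r'.?[0-9]{1,2}[-—~]+[0-9]{3,7}.?')

def percent_upper(line):
    '''Assumed behavior: fraction of alphabetic characters in the line
    that are uppercase (0.0 if the line contains no letters).'''
    letters = [c for c in line if c.isalpha()]
    if not letters:
        return 0.0
    return sum(1 for c in letters if c.isupper()) / len(letters)

def match_strings(a, b):
    '''Assumed behavior: fuzzy similarity between two strings on a 0-1 scale;
    SequenceMatcher stands in for whatever the project actually uses.'''
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# percent_upper('MACMILLAN, JOHN.') == 1.0
# match_strings('BOOK REVIEW DIGEST', 'BOOK REV1EW D1GEST') comes out above the
# 0.8 threshold used when checking for a running header.
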
Example no. 4
    def __init__(self, linelist, rule_list, textpage):
        self.pagenum = textpage

        alltuples = []
        for line in linelist:
            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)
            for astring, tags in zip(taglist.stringseq, taglist.tagseq):
                alltuples.append((astring, tags))

        titlestart = False
        titledone = False
        authordone = False
        authorstop = False
        dollarpricefound = False

        # The underlying logic here is elegant. We take words up to the first
        # full stop as the "author." From there to the next full stop is
        # the "title." Except, well, in cases of initials.
        # Since "Adams, B. V." has two periods, the rule is that we need
        # a non-fullstopped word or a word of more than three characters to trigger
        # "title."

        # To implement that we need two flags:
        #   authorstop -- we have reached a full stop
        #   authordone -- we also hit a subsequent word that lacks a period
        #                 or is more than three chars long

        title = []
        author = []
        price = 0
        publisher = []
        tokenssincenumpages = 3

        for word, tags in alltuples:

            tokenssincenumpages += 1

            if authorstop and not authordone:
                if word.startswith('eds.') or word.startswith('pseud.'):
                    author.append(word)
                elif len(word) > 1 and numcaps(word) / len(word) < 1:
                    authordone = True
                    if word[0].isupper():
                        titlestart = True
                        title.append(word)
                else:
                    author.append(word)

            elif not authordone:
                author.append(word)
                if 'fullstop' in tags:
                    authorstop = True
                elif len(word) > 1 and numcaps(word) / len(word) < 0.6:
                    authorstop = True

            elif not titledone:

                if word[0].isupper():
                    titlestart = True

                if titlestart and 'fullstop' in tags:
                    titledone = True

                if titlestart and 'dollarprice' in tags and not 'numpages' in tags:
                    price = pricetranslate(word)
                    if '$' in word:
                        dollarpricefound = True
                    titledone = True

                if titlestart and 'numpages' in tags:
                    titledone = True
                    publisher.append(word)
                    tokenssincenumpages = 0

                else:
                    title.append(word)

            else:
                if titlestart and 'numpages' in tags:
                    publisher.append(word)
                    tokenssincenumpages = 0

                elif tokenssincenumpages < 3 and 'somenumeric' in tags:
                    if '$' in word:
                        dollarpricefound = True
                    price = aggressivepricetranslate(word)

                elif 'dollarprice' in tags:
                    tryprice = pricetranslate(word)
                    if not dollarpricefound:
                        price = tryprice
                    if '$' in word:
                        dollarpricefound = True
                elif 'centprice' in tags and not dollarpricefound:
                    price = pricetranslate(word)
                elif word.strip(
                        "'‘’*t") in valid_prices and not dollarpricefound:
                    price = valid_prices[word.strip("'‘’*t")]
                else:
                    publisher.append(word)

        self.author = ' '.join(author)
        self.title = ' '.join(title)
        self.publisher = ' '.join(publisher)
        self.price = price
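
The Citation constructor above depends on helpers that are not shown in this excerpt: numcaps, pricetranslate, aggressivepricetranslate, and valid_prices. Below is a hedged sketch of plausible stand-ins for the two simplest ones, inferred only from how they are used; the real versions presumably handle more OCR quirks.

import re

def numcaps(word):
    '''Assumed behavior: count of uppercase characters in a token.'''
    return sum(1 for c in word if c.isupper())

def pricetranslate(word):
    '''Assumed behavior: extract a numeric price from an OCR token,
    e.g. '$1.50,' -> 1.5 and '75c' -> 0.75; return 0 if nothing usable.'''
    token = word.strip('.,;:*')
    if token.endswith('c') and token[:-1].replace('.', '').isdigit():
        return float(token[:-1]) / 100.0
    found = re.search(r'\d+(\.\d+)?', token)
    return float(found.group()) if found else 0

# numcaps('ADAMS,') == 5      numcaps('Adams,') == 1
# pricetranslate('$1.50.') == 1.5
# pricetranslate('75c') == 0.75
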
Example no. 5
def divide_into_quotations(booklist):

    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')
    longreviewnames = set()
    for rev in reviewdict.keys():
        reviewparts = rev.split()
        if len(reviewparts) < 1:
            continue
        elif len(reviewparts[0]) > 4:
            longreviewnames.add(reviewparts[0])

    publishers = [
        'Liverlght', 'Appleton', 'Baker', 'Barnes', 'Benziger', 'Bobbs',
        "Brentano's", 'Cassell', 'Century', 'Collier-Fox', 'Crowell', 'Ditson',
        'Dodd', 'Doran', 'Doubleday', 'Dutton', 'Elder', 'Estes', 'Ginn',
        'Goodspeed', 'Harper', 'Heath', 'Holt', 'Houghton', 'Knopf', 'Lane',
        'Lippincott', 'Little', 'Liveright', 'Longmans', 'Macmillan',
        'McBride', 'McClure', 'McGraw', 'Moffat', 'Oxford', 'Page', 'Pott',
        'Putnam', 'Scribner', 'Simmons', 'Stokes', 'Walton', 'Warne',
        'Wessels', 'Wilde', 'Wiley', 'Winston', 'Yale'
    ]

    lexical_patterns = [('numeric', r'.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('reviewword', all_reviewwords),
    ('openparen', r'\(.*'),
    ('closeparen', r'.*\)'),
    ('fullstop', r'.*\.'),
    ('commastop', r'.*\,'),
    ('startdash', '—.*'),
    ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
    ('titlecase', '[A-Z].*'),
    ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
    ('lineendingyear', r'[\'"•■]\d+'),
    ('volandpgrange', '[0-9]+[:][0-9-]+'),
    ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
    ('allcaps', r'[A-Z\'\,]+'),
    ('dollarprice', r'.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
    ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
    ('openquote', r'[\"\'“‘]+\S*'),
    ('plusorminus', r'[\+\-\—]+'),
    ('reviewword', all_reviewwords),
    ('wordcount', r'\d*0w[.]?'),
    ('OCRwordcount', r'\S*Ow[.]?')
    ]

    wordcountregex = re.compile(r'\d*0w[.]?')
    ocrwordcountregex = re.compile(r'\S*Ow[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    for book in booklist:
        lines = book.reviewlines

        accumulated = []
        citationcount = 0

        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):

            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            if len(addtonext) > 0:
                line = addtonext + ' ' + line
                addtonext = ''

            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # in the first few lines we often have fragments
            # left over from the book bibliographical entry

            if linecount <= 3:
                trailingbibline = False

                for tags in taglist.tagseq:
                    if 'hyphennumber' in tags or 'dollarprice' in tags or 'centprice' in tags:
                        trailingbibline = True

                if trailingbibline:

                    # get the existing publisher to see if it makes more sense
                    # fused with something in this trailing line

                    existingpubparts = book.publisher.split()
                    if len(existingpubparts) > 0:
                        existingpub = existingpubparts[-1].strip('-')
                    else:
                        existingpub = 'not a publisher'

                    tokenssofar = []
                    for l in accumulated:
                        tokenssofar.extend(l.strip().split())
                    tokenssofar.extend(tokens)

                    tokenssofar = [x.strip('.,[]()-') for x in tokenssofar]

                    for tok in tokenssofar:
                        if tok in publishers:
                            book.publisher = tok

                        rejoined = existingpub + tok
                        if rejoined in publishers:
                            book.publisher = book.publisher.strip('-') + tok

                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is that
            # a) this is the first sequence of lines,
            # b) the next line opens with a quotation mark,
            # and c) no other citation info has been provided yet.

            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    sentiment = ''
                    review = 'summary'
                    cite = 'summary'
                    citationcount += 1
                    quote = Quotation(book, review, sentiment, cite,
                                      accumulated)
                    allquotes.append(quote)
                    accumulated = []
                    accumulated.append(line)
                    # this line (opening with a quote) will be part of the next quotation
                    continue

            numberwords = 0
            reviewwords = 0
            plusyet = False
            totalclues = 0

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags:
                    reviewwords += 1
                    totalclues += 1

                elif 'plusorminus' in tags and not plusyet:
                    reviewwords += 0.5
                    totalclues += 1
                    plusyet = True

                elif 'monthabbrev' in tags:
                    totalclues += 1

                elif 'somenumeric' in tags and not '-' in word and not ',' in word:
                    numberwords += 1
                    totalclues += 1
                    if word.endswith('w'):
                        totalclues += 1
                        reviewwords += 0.5
                    elif ':' in word:
                        totalclues += 1
                        reviewwords += 0.5
                    elif word.startswith('p'):
                        totalclues += 1
                        reviewwords += 0.5

            # fuzzy match in situations where everything is there except the review name,
            # because the name could easily be an OCR error

            if numberwords > 0 and totalclues > 2 and reviewwords < 0.9:
                firstword = taglist.stringseq[0]
                if len(firstword) > 3:
                    for longname in longreviewnames:
                        similarity = match_strings(firstword, longname)
                        if similarity > .7:
                            reviewwords += 1
                            totalclues += 1
                            break

            if numberwords > 0 and reviewwords > 0.9 and totalclues > 3:
                sentimentbits = []

                numericyet = False
                publisherbits = []
                citationbits = []

                nextwordctr = 0

                for word, tags in zip(taglist.stringseq, taglist.tagseq):

                    nextwordctr += 1

                    if not numericyet and word == '+':
                        sentimentbits.append('+')
                        continue
                    if not numericyet and word in plusmisreads:
                        # e.g. '4-' is a fairly common ocr misread for +
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '-' or word == '—'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '=-' or word == '--'
                                           or word == '-—'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '==' or word == '=--'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '+-' or word == '+—'
                                           or word == '+='):
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '-+' or word == "—+"
                                           or word == '=+'):
                        sentimentbits.append('-')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '++-' or word == '++—'
                                           or word == "++="):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '+++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and nextwordctr == 1 and word == "H":
                        # this is a weird but common misread; however, it's risky
                        # enough that we should only do it in first position
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue

                    if 'somenumeric' in tags:
                        numericyet = True

                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)

                    if numericyet and ('wordcount' in tags or 'OCRwordcount'
                                       in tags) and (nextwordctr < len(
                                           taglist.stringseq)):
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break

                # if this line doesn't end with a word count, and the next one does?
                # probably a continuation

                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]) and not ocrwordcountregex.fullmatch(
                            citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline) > 0 and len(
                                wordsinnextline
                        ) < 3 and wordcountregex.fullmatch(
                                wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True

                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)
                citationcount += 1

                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []

            else:
                # not enough citation clues on this line; treat it as review text
                accumulated.append(line)

    return allquotes
Example no. 6
def subdivide_author(auth, rule_list):
    ''' This function accepts an Author, which contains a list of lines
    paired with group_tags to identify "WORKS BY" or "WORKS ABOUT." *Every* line in a
    WORKS ABOUT section is expected to bear the tag "about."

    First this function turns the lines
    into TaggedLists (a class from the lexparse module).

    Then it iterates through tuples of the form
            (TaggedList, group_tag)

    and aggregates them into Citations.
    '''

    author_name = auth.name
    print(author_name)
    rawlines = auth.get_lines()

    last_tag = 'match-any'

    citation_list = []

    tokenchunk = []

    tagged_group = 'not yet used'

    for line, group_tag in rawlines:
        tokens = line.strip().split()
        if len(tokens) < 1:
            continue
        # what to do with the tokens depends on
        # where we are in a tag sequence

        if last_tag == 'match-any':
            # turn them into a new TaggedList (the first for this author)
            tagged_group = lexparse.apply_rule_list(rule_list, tokens)
            last_tag = group_tag

        elif group_tag == last_tag:
            # group_tag remains the same (WORKS ABOUT or WORKS BY)
            # so just extend the group

            new_group = lexparse.apply_rule_list(rule_list, tokens)
            tagged_group.extend(new_group)

        else:
            # there has been a shift of tag so let's divide the
            # groups

            new_citations = divide_into_citations(tagged_group, last_tag,
                                                  author_name)
            citation_list.extend(new_citations)

            # and create a new tagged_group
            tagged_group = lexparse.apply_rule_list(rule_list, tokens)
            # and make this tag into the last_tag
            last_tag = group_tag

    # when we're done, since there's not a next tag to trigger the
    # division of tagged_group, we have to do it explicitly

    if type(tagged_group) != str:
        # slightly cheesy way of confirming that it's a TaggedList
        new_citations = divide_into_citations(tagged_group, last_tag,
                                              author_name)
        citation_list.extend(new_citations)

    return citation_list
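
subdivide_author walks the (line, group_tag) pairs and starts a new group each time the tag flips between WORKS BY and WORKS ABOUT. The same run-grouping step can be expressed compactly with itertools.groupby; in this sketch, plain strings stand in for the TaggedList objects that the real code accumulates, and group_by_tag is an illustrative helper rather than part of the module.

from itertools import groupby

def group_by_tag(rawlines):
    '''Collapse (line, group_tag) pairs into (group_tag, [lines]) runs,
    one run per contiguous stretch of identical tags.'''
    return [(tag, [line for line, _ in pairs])
            for tag, pairs in groupby(rawlines, key=lambda pair: pair[1])]

rawlines = [('SMITH, JOHN. Poems. $1.50. Macmillan.', 'by'),
            ('+ Bookman 52:101 O \'20 150w', 'by'),
            ('Smith and his critics.', 'about')]

print(group_by_tag(rawlines))
# [('by', ['SMITH, JOHN. Poems. $1.50. Macmillan.', "+ Bookman 52:101 O '20 150w"]),
#  ('about', ['Smith and his critics.'])]
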
Example no. 7
def parse_pages(pagefiles, rule_list, prev_entry = 'A'):

    for p in pagefiles:
        ## first read in a page and tag all the strings in each line

        with open(p, encoding = 'utf-8') as f:
            filelines = f.readlines()
            tagged_page = []

            for fl in filelines:
                tokens = fl.strip().split()
                if len(tokens) < 1:
                    continue
                else:
                    tagged_line = lexparse.apply_rule_list(rule_list, tokens)
                    tagged_page.append(tagged_line)

        ## now identify the lines themselves as entries or as citations

        linetuples = []
        for index, line in enumerate(tagged_page):
            if index < 2 and is_header(line):
                linetag = 'header'
            elif is_citation_end(line):
                linetag = 'citationend'
            elif has_citation(line):
                linetag = 'citationpart'
            elif is_entry(line):
                linetag = 'entry'
            elif is_all_text(line):
                linetag = 'alltext'
            else:
                linetag = 'ambiguous'

            linetuples.append((linetag, line))


        # Now we organize lines into groups that share an entry.
        # First, use alphabetical sequence to confirm entries. We're going to
        # create a list of lines that we think are entries.

        entrypoints = []

        # We're going to rely on the variable prev_entry, inherited from
        # the previous page, to make sure that entries are in alphabetical
        # order. But we also need a way to correct errors if we get off.

        lowerthanprev = 0
        allentries = 1
        # note a bit of additive smoothing
        firstonpage = 'Aa'

        for ltuple in linetuples:
            tag, line = ltuple
            if tag == 'entry':
                firstword = line.stringseq[0]

                if firstonpage == 'Aa':
                    firstonpage = firstword
                    # 'Aa' is just a flag that we haven't taken the firstonpage yet

                allentries += 1
                if firstword < prev_entry:
                    lowerthanprev += 1

        if (lowerthanprev / allentries) > 0.5:
            prev_entry = firstonpage

            # If more than half the entries on the page begin with a word
            # alphabetically earlier than prev_entry, we have gotten out of
            # order, and the prev_entry needs to be reset to the first on page.

        for idx, ltuple in enumerate(linetuples):
            tag, line = ltuple
            firstword = line.stringseq[0]
            if tag == 'entry' and firstword >= prev_entry:
                entrypoints.append(idx)
                prev_entry = firstword
            elif tag == 'alltext' and line.stringseq[0] >= prev_entry:
                for idx2 in range(idx +1, len(linetuples)):
                    tag2, line2 = linetuples[idx2]
                    if tag2 == 'entry' and line2.stringseq[0] >= firstword:
                        entrypoints.append(idx)
                        prev_entry = firstword
                        break
                    elif tag2 == 'entry':
                        break
            else:
                continue

        # okay, now we have a list of lines that we think are entries.
        # we can use that to create chunks of lines that share the same
        # entry

        chunks = []
        for idx, entrypoint in enumerate(entrypoints):
            linenum = entrypoint
            if idx + 1 >= len(entrypoints):
                chunks.append((entrypoint, len(linetuples)))
            else:
                for idx2 in range(idx + 1, len(entrypoints)):
                    linenum2 = entrypoints[idx2]
                    if linenum2 == linenum + 1:
                        # sequential entries should be kept together
                        linenum = linenum2
                    else:
                        # aha, a break
                        chunks.append((entrypoint, linenum2))
                        break

        citations = []
        for chunktuple in chunks:
            startline, stopline = chunktuple
            new_chunk = linetuples[startline: stopline]
            new_citations = parse_chunk(new_chunk)
            citations.extend(new_citations)

        current_header = ''
        for cite in citations:
            main_subject = cite.get_subject('main')
            if main_subject.startswith('<head'):
                cite.set_subject('main', current_header)
            else:
                current_header = main_subject

        outpath = p.replace('clean', 'results').replace('.txt', '.tsv')
        columns = ['mainhed', 'subhed', 'author', 'journalname', 'volandpg', 'fullcite']
        with open(outpath, mode = 'w', encoding = 'utf-8') as f:
            writer = csv.DictWriter(f, delimiter = '\t', fieldnames = columns)
            writer.writeheader()
            for cite in citations:
                outrow = cite.get_outrow()
                writer.writerow(outrow)
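
The prev_entry correction near the top of parse_pages is essentially a majority vote: if more than half of the detected entries on a page sort before the prev_entry carried over from the previous page, the carried value is presumed wrong and reset to the first entry on the page. A small self-contained sketch of that heuristic; correct_prev_entry is an illustrative name, not part of the module.

def correct_prev_entry(prev_entry, entry_firstwords):
    '''Reset prev_entry when most entries on a page sort before it,
    which usually means the inherited value was an OCR error.'''
    if not entry_firstwords:
        return prev_entry
    lowerthanprev = sum(1 for w in entry_firstwords if w < prev_entry)
    allentries = len(entry_firstwords) + 1   # additive smoothing, as above
    if (lowerthanprev / allentries) > 0.5:
        return entry_firstwords[0]
    return prev_entry

# An OCR misread left prev_entry at 'Zyxt' on the previous page; the entries
# on this page outvote it and reset it to their own first entry.
print(correct_prev_entry('Zyxt', ['Abbott', 'Adams', 'Baker']))   # Abbott
print(correct_prev_entry('Adams', ['Baker', 'Carter']))           # Adams
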