def authors_to_citations(author_list):
    """Break every author entry into citations and parse each of them.

    The lexical tagging rules are built once, then each element of
    ``author_list`` is handed to ``subdivide_author`` together with those
    rules. The list it returns is passed to ``parse_parts`` and then added
    to the overall result.

    Args:
        author_list: iterable of author entries in whatever form
            ``subdivide_author`` accepts (not visible from this function).

    Returns:
        A single flat list holding the citations of all authors, in the
        order the authors were supplied.
    """
    # (tag, pattern) pairs; a pattern is either a regex string or a set of
    # literal tokens. Regexes containing backslashes are written as raw
    # strings (or with explicit '\\') so that Python does not see invalid
    # string escapes such as '\(' or '\d'; the resulting pattern text is
    # unchanged.
    lexical_patterns = [
        ('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('genreword', {'reviewed', 'by', 'review.', 'review', 'by,',
                       'article.', 'article', 'article-non', 'lit.',
                       'non-lit.', 'poem.', 'fict.', 'fiction', 'fict',
                       'fiction.', 'poem'}),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'.*\.'),
        ('commastop', r'.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', '[\'"•■]\\d+'),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', "[A-Z']+"),
    ]

    rule_list = lexparse.patterns2rules(lexical_patterns)

    all_parsed_citations = []

    for auth in author_list:
        citation_list = subdivide_author(auth, rule_list)
        # The return value of parse_parts is ignored; presumably it updates
        # the citations in place -- TODO confirm against parse_parts.
        parse_parts(citation_list)
        all_parsed_citations.extend(citation_list)

    return all_parsed_citations
def divide_into_quotations(booklist):
    """Cut the review lines of every book into attributed quotations.

    For each book, lines from ``book.reviewlines`` are accumulated until a
    line is recognised as a citation line (publication name, optional
    sentiment marks, then volume/page/date/word-count data). The
    accumulated lines plus the parsed citation line become one
    ``Quotation``.

    A line scores one point for its first 'reviewword' token, one point for
    each 'plusorminus' token and one point for each 'somenumeric' token that
    contains neither '-' nor ','. It is treated as a citation line when the
    score exceeds 1 and it is not one of the first two lines, or when the
    score exceeds 2 anywhere.

    Two special cases are handled before scoring:

    * In the first four lines, a line containing a price or a hyphenated
      number is taken to be left over from the bibliographic entry; it is
      kept in the accumulated text with ' <endsubj>' appended.
    * If no citation has been seen yet, more than three lines have been
      accumulated and the current line opens with a quotation mark, the
      accumulated lines are emitted as an unattributed 'summary' quotation.

    Lines still accumulated when a book's lines run out are not emitted.

    Args:
        booklist: iterable of book objects exposing ``reviewlines`` (a
            sequence of strings).

    Returns:
        A list of ``Quotation`` objects for all books, in order.
    """
    # Only the word list is needed here; the second return value is unused.
    all_reviewwords, _ = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')

    # (tag, pattern) pairs; patterns are regex strings or sets of literal
    # tokens. Regexes with backslashes are raw strings (or use explicit
    # '\\') so Python sees no invalid string escapes; the pattern text
    # itself is unchanged.
    lexical_patterns = [
        ('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('reviewword', all_reviewwords),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'.*\.'),
        ('commastop', r'.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', '[\'"•■]\\d+'),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', r"[A-Z'\,]+"),
        ('dollarprice', r'.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
        ('openquote', '["\'“‘]+\\S*'),
        ('plusorminus', r'[\+\-\—]+'),
        # NOTE(review): 'reviewword' is deliberately left listed twice;
        # whether rule order or repetition matters to lexparse is not
        # visible from here.
        ('reviewword', all_reviewwords),
        ('wordcount', r'\d*0w[.]?'),
    ]

    wordcountregex = re.compile(r'\d*0w[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    # Tokens that encode sentiment when they appear before the first
    # numeric token of a citation line, mapped to the marks they stand for.
    sentiment_marks = {
        '+': ['+'],
        '-': ['-'], '—': ['-'], '—-': ['-'],
        '=-': ['-'], '--': ['-'], '-—': ['-'],
        '==': ['-'], '=--': ['-'],
        '+-': ['+', '-'], '+—': ['+', '-'], '+=': ['+', '-'],
        '-+': ['-', '+'], '—+': ['-', '+'], '=+': ['-', '+'],
        # A '++' followed by a minus-like character is plus, plus, minus.
        '++-': ['+', '+', '-'], '++—': ['+', '+', '-'],
        '++=': ['+', '+', '-'],
        '++': ['+', '+'],
        '+++': ['+', '+', '+'],
    }
    for misread in plusmisreads:
        # e.g. '4-' is a fairly common ocr misread for +
        sentiment_marks[misread] = ['+']

    for book in booklist:
        lines = book.reviewlines

        accumulated = []
        citationcount = 0

        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):

            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            # Text that followed a word count on the previous citation line
            # belongs to the start of this line.
            if addtonext:
                line = addtonext + ' ' + line
                addtonext = ''

            # This line was already absorbed into the previous citation.
            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if not tokens:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # In the first four lines (linecount 0-3) we often have
            # fragments left over from the book's bibliographical entry.

            if linecount <= 3:
                trailingbibline = any(
                    'hyphennumber' in tags or 'dollarprice' in tags
                    or 'centprice' in tags for tags in taglist.tagseq)
                if trailingbibline:
                    accumulated.append(line + ' <endsubj>')
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.

            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    citationcount += 1
                    allquotes.append(
                        Quotation(book, 'summary', '', 'summary',
                                  accumulated))
                    # this line (opening with a quote) will be part of
                    # the next quotation
                    accumulated = [line]
                    continue

            oddsofreview = 0
            reviewwordyet = False

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags and not reviewwordyet:
                    oddsofreview += 1
                    reviewwordyet = True
                if 'plusorminus' in tags:
                    oddsofreview += 1
                if ('somenumeric' in tags and '-' not in word
                        and ',' not in word):
                    oddsofreview += 1

            if not ((oddsofreview > 1 and linecount > 1)
                    or oddsofreview > 2):
                # not enough evidence of a citation: ordinary review text
                accumulated.append(line)
                continue

            sentimentbits = []
            publisherbits = []
            citationbits = []
            numericyet = False

            for nextwordctr, (word, tags) in enumerate(
                    zip(taglist.stringseq, taglist.tagseq), start=1):

                if not numericyet:
                    marks = sentiment_marks.get(word)
                    if marks is None and nextwordctr == 1 and word == 'H':
                        # this is a weird but common misread; however,
                        # it's risky enough that we should only do it in
                        # first position
                        marks = ['+', '-']
                    if marks is not None:
                        sentimentbits.extend(marks)
                        continue

                if 'somenumeric' in tags:
                    numericyet = True

                # Everything before the first numeric token names the
                # publication; everything from it onward is the citation.
                if not numericyet:
                    publisherbits.append(word)
                else:
                    citationbits.append(word)

                if (numericyet and 'wordcount' in tags
                        and nextwordctr < len(taglist.stringseq)):
                    # The word count closes the citation; whatever follows
                    # on this line is carried over to the next line.
                    addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                    break

            # if this line doesn't end with a word count, and the next one
            # does? probably a continuation

            if citationbits and not wordcountregex.fullmatch(
                    citationbits[-1]):
                if linecount < (len(lines) - 1):
                    wordsinnextline = lines[linecount + 1].strip().split()
                    if wordsinnextline and wordcountregex.fullmatch(
                            wordsinnextline[-1]):
                        citationbits.extend(wordsinnextline)
                        skipnext = True

            sentiment = ' '.join(sentimentbits)
            review = ' '.join(publisherbits)
            cite = ' '.join(citationbits)
            citationcount += 1

            allquotes.append(
                Quotation(book, review, sentiment, cite, accumulated))
            accumulated = []

    return allquotes
# Example #3
# 0
def _header_page_number(wordsinline):
    """Return the page number at either end of a running header, or None.

    Only headers of more than three words are considered. The first word is
    preferred over the last. None is returned when neither end is a digit
    string or when the digit string cannot be converted by ``int``.
    """
    if len(wordsinline) <= 3:
        return None
    for candidate in (wordsinline[0], wordsinline[-1]):
        if candidate.isdigit():
            try:
                return int(candidate)
            except ValueError:
                # str.isdigit() accepts characters (e.g. superscript
                # digits) that int() rejects.
                return None
    return None


def _advance_textpage(token, textpage, aligned):
    """Fold a page-number-like token into the running page state.

    Returns the updated ``(textpage, aligned)`` pair. When ``token`` converts
    to an int, that int becomes the page number; ``aligned`` is incremented
    if it directly follows the previous page and reset to 0 otherwise. When
    the conversion fails, the page number is advanced by one and ``aligned``
    is left alone.
    """
    try:
        newtextpage = int(token)
    except ValueError:
        return textpage + 1, aligned
    if textpage + 1 == newtextpage:
        aligned += 1
    else:
        aligned = 0
    return newtextpage, aligned


def get_books(pagelist, publishers):
    """Divide pages of text into books: a citation plus its review lines.

    Args:
        pagelist: sequence of pages, each a sequence of line strings.
        publishers: container of publisher names; a line whose last token
            (stripped of '.') is in it can close a citation.

    Returns:
        A tuple ``(books, author_errors)``. ``books`` is a list of ``Book``
        objects, each built from a citation and the lines that followed it.
        The first book produced carries a placeholder 'Anonymous' citation
        and whatever lines preceded the first real citation.
        ``author_errors`` is a list of
        ``(textpage, last_author_name, new_author)`` tuples recorded whenever
        a new author compares lower than the previous one.
    """
    # (tag, pattern) pairs; patterns are regex strings or sets of literal
    # tokens. Regexes with backslashes are raw strings (or use explicit
    # '\\') so Python sees no invalid string escapes; the pattern text
    # itself is unchanged.
    lexical_patterns = [
        ('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('genreword', {'reviewed', 'by', 'review.', 'review', 'by,',
                       'article.', 'article', 'article-non', 'lit.',
                       'non-lit.', 'poem.', 'fict.', 'fiction', 'fict',
                       'fiction.', 'poem'}),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'\S+[\.\?!]'),
        ('commastop', r'.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', '[\'"•■]\\d+'),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        # NOTE(review): the leading '[[' puts a literal '[' in the first
        # character class (and makes `re` warn about a possible nested
        # set); left as-is because it is unclear whether that is intended.
        ('allcaps', r"[[A-Z'\,\‘\.\-:;]*[A-Z]{2,}[A-Z'\,\‘\.\:;]*"),
        ('dollarprice', '.{0,2}[$"\'\\“].?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', '.?[0-9]{1,2}[-—~]+[0-9]{3,7}.?'),
        ('openquote', '["\'“‘]+\\S*'),
        ('deweydecimal', '[0-9]{3}[.][0-9-]+'),
        ('numpages', r'\d{2,5}p'),
    ]

    rule_list = lexparse.patterns2rules(lexical_patterns)

    # Our strategy is to find pairs of lines that bookend a citation. A
    # citation and the lines that follow it (before the next citation)
    # is going to count as an Author.

    # A citation starts with a line whose first word is either capitalized and
    # followed by a comma, or all uppercase and longer than three characters.
    # It should be alphabetically later than the last author-name. Exceptions
    # are flagged.

    # A citation ends with a line (up to five lines later) that contains
    # $ + number or number + c, or number containing a hyphen, or that ends
    # with a publisher.

    # We create a list of author_names as well as a list of Author objects.

    author_errors = []
    books = []

    # We keep track of the last author name we've seen; it's going to be
    # the name that governs the lines we are currently examining.

    last_author_name = ''

    textpage = 1

    reviewlines = []
    citation_started = False
    citation_finished = False

    citationlines = []
    governing_citation = Citation(['Anonymous. My book. $1.00. Macmillan.'],
                                  rule_list, textpage)
    aligned = 0

    for page in pagelist:

        for linenum, line in enumerate(page):

            line = line.strip()

            this_line_is_new_citation = False

            # Line numbers are only relevant in guiding us to ignore the running header,
            # and to update the page number. This will be imperfect, because OCR,
            # but that's okay. If we get four successive pages that increment each
            # other as they should (10, 11, 12, 13, ...), we say that enough have been aligned to
            # override checking and just keep adding one per page.

            if aligned >= 4 and linenum == 0:
                textpage += 1

            if linenum < 4:
                thematch = match_strings('BOOK REVIEW DIGEST', line)
                if thematch > 0.8 and len(line) > 7:
                    # The header's page number is kept in its own variable
                    # and only acted on when one was actually parsed.
                    headerpage = _header_page_number(line.split())
                    if headerpage is not None:
                        if textpage + 1 == headerpage:
                            aligned += 1
                            textpage = headerpage
                        elif headerpage < 20:
                            textpage = headerpage

                    continue
                    # skip this line

                if len(line) > 15:
                    thetail = line[-11:]
                    thematch = match_strings('—Continued.', thetail)
                    if thematch > 0.75:
                        continue

                # A bare number, or a long line that ends or begins with a
                # number, is read as a page number until alignment is
                # established.
                if line.isdigit() and aligned < 4:
                    textpage, aligned = _advance_textpage(
                        line, textpage, aligned)

                elif len(line) > 12 and line[-2:].isdigit() and aligned < 4:
                    words = line.split()
                    if words[-1].isdigit():
                        textpage, aligned = _advance_textpage(
                            words[-1], textpage, aligned)

                elif len(line) > 12 and line[0:2].isdigit() and aligned < 4:
                    words = line.split()
                    if words[0].isdigit():
                        textpage, aligned = _advance_textpage(
                            words[0], textpage, aligned)

            if line.startswith('Figures in parenth'):
                continue

            tokens = line.split()
            if not tokens:
                continue

            # There are things that look like the beginning of a citation but
            # are actually cross-references

            skepticalofcitestart = "See" in line

            nextline = linenum + 1

            if nextline < len(page) and "See" in page[nextline]:
                skepticalofcitestart = True

            cannotcitestart = False

            if skepticalofcitestart and linenum + 5 < len(page):
                for lookforward in range(1, 5):
                    futureline = page[linenum + lookforward]
                    if percent_upper(futureline) > .3:
                        cannotcitestart = True

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            firstword = taglist.stringseq[0]
            firsttagset = taglist.tagseq[0]

            cluescitationahead = 0

            # Look at most five lines ahead, without running off the page.
            distancetolook = min(6, len(page) - linenum)

            if 'allcaps' in firsttagset and len(
                    firstword) > 2 and distancetolook > 1:
                for lookforward in range(1, distancetolook):
                    futureline = page[linenum + lookforward]
                    if '$' in futureline or "ed." in futureline:
                        cluescitationahead += 1
                    if cluescitationahead > 0:
                        for t in futureline.split():
                            if hyphenregex.fullmatch(t):
                                cluescitationahead += 1

            if not citation_started and not cannotcitestart:

                allcapcount = 0
                for tags in taglist.tagseq:
                    if 'allcaps' in tags:
                        allcapcount += 1

                lineuppercasepct = percent_upper(line)

                if len(line) > 9 and (
                        (allcapcount > 0 and lineuppercasepct > 0.1)
                        or lineuppercasepct > 0.6
                        or (allcapcount > 0 and cluescitationahead > 0)):

                    percentageupper = percent_upper(firstword)
                    if len(line) > 15:
                        pctupin15 = percent_upper(line[0:15])
                    else:
                        pctupin15 = 0

                    if 'allcaps' in firsttagset and len(
                            firstword) > 2 and cluescitationahead > 0:
                        this_line_is_new_citation = True
                    elif lineuppercasepct > 0.72 and len(line) > 14:
                        this_line_is_new_citation = True
                    elif pctupin15 > .35 and ('$' in line or cluescitationahead
                                              > 1) and len(reviewlines) > 3:
                        this_line_is_new_citation = True
                    elif percentageupper > 0.7 and len(
                            firstword) > 4 and allcapcount > 2:
                        this_line_is_new_citation = True
                    elif pctupin15 > 0.65:
                        this_line_is_new_citation = True
                    else:
                        reviewlines.append(line)

                else:
                    reviewlines.append(line)

                if this_line_is_new_citation:
                    # a new citation has begun
                    citation_finished = False

                    citationlines = []

                    for tags in taglist.tagseq:
                        if 'dollarprice' in tags or 'centprice' in tags:
                            # note that our conditions for ending a citation with the first
                            # line are more stringent than they will be from the second onward
                            citation_finished = True
                            break

            elif not citation_started and cannotcitestart:
                reviewlines.append(line)

            else:
                # if a citation has been started, let's see if we should end it

                for tags in taglist.tagseq:
                    if ('dollarprice' in tags or 'centprice' in tags
                            or 'hyphennumber' in tags
                            or 'deweydecimal' in tags):
                        # more conditions can end a citation now
                        citation_finished = True
                        break

                if len(taglist.stringseq) > 1 and taglist.stringseq[-1].strip(
                        '.') in publishers:
                    # sometimes there's no price and the publisher's name is the only clue
                    # that the citation is finished
                    citation_finished = True

                if len(citationlines) > 2 and len(
                        taglist.tagseq
                ) > 1 and 'somenumeric' in taglist.tagseq[0]:
                    # A line opening with a number above 99 is taken to be
                    # a Dewey decimal classification.
                    try:
                        deweydecimal = float(taglist.stringseq[0])
                    except ValueError:
                        pass
                    else:
                        if deweydecimal > 99:
                            citation_finished = True

            if this_line_is_new_citation or citation_started:
                citationlines.append(line)
                citation_started = True
                this_line_is_new_citation = False

            if citation_finished:
                # we have concluded a new citation
                # first, make the last citation into a book:

                thisbook = Book(governing_citation, reviewlines)
                books.append(thisbook)

                # initialize reviewlines, and create a new citation
                reviewlines = []
                citation_started = False
                citation_finished = False
                # we finished that citation, started a new one

                governing_citation = Citation(citationlines, rule_list,
                                              textpage)
                citationlines = []

                new_author = governing_citation.author
                if new_author < last_author_name:
                    author_errors.append(
                        (textpage, last_author_name, new_author))
                last_author_name = new_author

            elif len(citationlines) > 8:
                # this is too many lines, and we were probably in error to have
                # started the citation, so put those lines back in reviewlines.
                # This is esp. likely to happen at the top of a page, when
                # an entry is "continued."

                reviewlines.extend(citationlines)
                citationlines = []
                citation_started = False

            elif len(citationlines) > 2:
                lineuppercasepct = percent_upper(line)
                lastuppercasepct = percent_upper(citationlines[-2])

                if lineuppercasepct > .45 and cluescitationahead > 0 and len(
                        line) > 12 and lastuppercasepct < .45:
                    # we started a citation in error two or more lines back; this is the actual
                    # citation start
                    # notice that we check the pct uppercase of last line to make sure this isn't
                    # just a long multiline author name!

                    # The earlier lines are dropped, not returned to
                    # reviewlines.
                    citationlines = [citationlines[-1]]

    # A Book is only emitted when the *following* citation is concluded, so
    # the final real citation still has to be turned into a Book here. Lines
    # of a citation that was started but never finished are treated as
    # review text, as is done for over-long citations above. Nothing is
    # emitted when no citation was ever concluded.
    if books:
        if citation_started:
            reviewlines.extend(citationlines)
        books.append(Book(governing_citation, reviewlines))

    return books, author_errors
# Example #4
# 0
def divide_into_quotations(booklist):

    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')
    longreviewnames = set()
    for rev in reviewdict.keys():
        reviewparts = rev.split()
        if len(reviewparts) < 1:
            continue
        elif len(reviewparts[0]) > 4:
            longreviewnames.add(reviewparts[0])

    publishers = [
        'Liverlght', 'Appleton', 'Baker', 'Barnes', 'Benziger', 'Bobbs',
        "Brentano's", 'Cassell', 'Century', 'Collier-Fox', 'Crowell', 'Ditson',
        'Dodd', 'Doran', 'Doubleday', 'Dutton', 'Elder', 'Estes', 'Ginn',
        'Goodspeed', 'Harper', 'Heath', 'Holt', 'Houghton', 'Knopf', 'Lane',
        'Lippincott', 'Little', 'Liveright', 'Longmans', 'Macmillan',
        'McBride', 'McClure', 'McGraw', 'Moffat', 'Oxford', 'Page', 'Pott',
        'Putnam', 'Scribner', 'Simmons', 'Stokes', 'Walton', 'Warne',
        'Wessels', 'Wilde', 'Wiley', 'Winston', 'Yale'
    ]

    lexical_patterns = [('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'), \
    ('reviewword', all_reviewwords),
    ('openparen', '\(.*'),
    ('closeparen', '.*\)'),
    ('fullstop', '.*\.'),
    ('commastop', '.*\,'),
    ('startdash', '—.*'),
    ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
    ('titlecase', '[A-Z].*'),
    ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
    ('lineendingyear', '[\'"•■]\d+'),
    ('volandpgrange', '[0-9]+[:][0-9-]+'),
    ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
    ('allcaps', '[A-Z\'\,]+'),
    ('dollarprice', '.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
    ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
    ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
    ('openquote', '[\"\'“‘]+\S*'),
    ('plusorminus', '[\+\-\—]+'),
    ('reviewword', all_reviewwords),
    ('wordcount', '\d*0w[.]?'),
    ('OCRwordcount', '\S*Ow[.]?')
    ]

    wordcountregex = re.compile('\d*0w[.]?')
    ocrwordcountregex = re.compile('\S*Ow[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    for book in booklist:
        lines = book.reviewlines

        accumulated = []
        citationcount = 0

        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):

            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            if len(addtonext) > 0:
                line = addtonext + ' ' + line
                addtonext = ''

            if skipnext:
                skipnext = False
                continue

            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # in the first two lines we often have fragments
            # left over from the book bibliographical entry

            if linecount <= 3:
                trailingbibline = False

                for tags in taglist.tagseq:
                    if 'hyphennumber' in tags or 'dollarprice' in tags or 'centprice' in tags:
                        trailingbibline = True

                if trailingbibline:

                    # get the existing publisher to see if it makes more sense
                    # fused with something in this trailing line

                    existingpubparts = book.publisher.split()
                    if len(existingpubparts) > 0:
                        existingpub = existingpubparts[-1].strip('-')
                    else:
                        existingpub = 'not a publisher'

                    tokenssofar = []
                    for l in accumulated:
                        tokenssofar.extend(l.strip().split())
                    tokenssofar.extend(tokens)

                    tokenssofar = [x.strip('.,[]()-') for x in tokenssofar]

                    for tok in tokenssofar:
                        if tok in publishers:
                            book.publisher = tok

                        rejoined = existingpub + tok
                        if rejoined in publishers:
                            book.publisher = book.publisher.strip('-') + tok

                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.

            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    sentiment = ''
                    review = 'summary'
                    cite = 'summary'
                    citationcount += 1
                    quote = Quotation(book, review, sentiment, cite,
                                      accumulated)
                    allquotes.append(quote)
                    accumulated = []
                    accumulated.append(line)
                    # this line (opening with a quote) will be part of the next quotation
                    matched = True
                    continue

            numberwords = 0
            reviewwords = 0
            plusyet = False
            totalclues = 0

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags:
                    reviewwords += 1
                    totalclues += 1

                elif 'plusorminus' in tags and not plusyet:
                    reviewwords += 0.5
                    totalclues += 1
                    plusyet = True

                elif 'monthabbrev' in tags:
                    totalclues += 1

                elif 'somenumeric' in tags and not '-' in word and not ',' in word:
                    numberwords += 1
                    totalclues += 1
                    if word.endswith('w'):
                        totalclues += 1
                        reviewwords += 0.5
                    elif ':' in word:
                        totalclues += 1
                        reviewwords += 0.5
                    elif word.startswith('p'):
                        totalclues += 1
                        reviewwords += 0.5

            # fuzzy match in situations where everything is there except the review
            # because it could easily be ocr error

            if numberwords > 0 and totalclues > 2 and reviewwords < 0.9:
                firstword = taglist.stringseq[0]
                if len(firstword) > 3:
                    for longname in longreviewnames:
                        similarity = match_strings(firstword, longname)
                        if similarity > .7:
                            reviewwords += 1
                            totalclues += 1
                            break

            if numberwords > 0 and reviewwords > 0.9 and totalclues > 3:
                sentimentbits = []

                numericyet = False
                publisherbits = []
                citationbits = []

                nextwordctr = 0

                for word, tags in zip(taglist.stringseq, taglist.tagseq):

                    nextwordctr += 1

                    if not numericyet and word == '+':
                        sentimentbits.append('+')
                        continue
                    if not numericyet and word in plusmisreads:
                        # e.g. '4-' is a fairly common ocr misread for +
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '-' or word == '—'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '=-' or word == '--'
                                           or word == '-—'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '==' or word == '=--'
                                           or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '+-' or word == '+—'
                                           or word == '+='):
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '-+' or word == "—+"
                                           or word == '=+'):
                        sentimentbits.append('-')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '++-' or word == '++—'
                                           or word == "++="):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                        sentimentbits.append('-')
                    if not numericyet and (word == '++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '+++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and nextwordctr == 1 and word == "H":
                        # this is a weird but common misread; however, it's risky
                        # enough that we should only do it in first position
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue

                    if 'somenumeric' in tags:
                        numericyet = True

                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)

                    if numericyet and ('wordcount' in tags or 'OCRwordcount'
                                       in tags) and (nextwordctr < len(
                                           taglist.stringseq)):
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break

                # if this line doesn't end with a word count, and the next one does?
                # probably a continuation

                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]) and not ocrwordcountregex.fullmatch(
                            citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline) > 0 and len(
                                wordsinnextline
                        ) < 3 and wordcountregex.fullmatch(
                                wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True

                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)
                citationcount += 1

                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []

            else:
                # odds of review 1 or less
                accumulated.append(line)

    return allquotes
# Example #5
# 0
    for cite in citation_list:
        parse_citation(cite, startinitial, endinitial)

    return citation_list


## MAIN

# Collect the input files. glob.glob returns its matches in arbitrary,
# filesystem-dependent order, so sort them to make the order in which
# files are processed reproducible from run to run.
paths = sorted(glob.glob('../pooles/poolesclean*txt'))

# Module-level state initialised before the files are read.
citations = []
# NOTE(review): presumably these hold the headword range of the page
# currently being read -- confirm against the loop that follows.
start_headword = ''
end_headword = ''

# Turn the (name, pattern) pairs in lexical_patterns into the rule list
# returned by lexparse.patterns2rules.
rule_list = lexparse.patterns2rules(lexical_patterns)

for p in paths:
    with open(p, encoding='utf-8') as f:
        filelines = f.readlines()
        page = []
        for fl in filelines:
            fl = fl.replace('.—', '. —')
            fl = fl.replace('—', '— ')
            fl = fl.replace(',', ', ')
            # The point of doing that is to make
            # sure citations get cleanly separated.
            # They often begin with a dash.

            tokens = fl.strip().split()
            if len(tokens) < 1 and len(page) > 0: