def get_citations(page, rule_list, start_headword, end_headword):
    """Tag one page, split it into citations, and parse each one.

    The first character of start_headword / end_headword (lowercased)
    bounds the alphabetic range passed to parse_citation; an empty
    headword falls back to 'a' or 'z' respectively.
    Returns the list of citations found on the page.
    """
    print()
    print('Next page.')

    tagged_page = lexparse.apply_rule_list(rule_list, page)
    citation_list = divide_into_citations(tagged_page)

    # Empty headwords default to the full alphabetic range.
    startinitial = start_headword[0].lower() if len(start_headword) >= 1 else 'a'
    endinitial = end_headword[0].lower() if len(end_headword) >= 1 else 'z'

    for cite in citation_list:
        parse_citation(cite, startinitial, endinitial)

    return citation_list
def divide_into_quotations(booklist):
    """Split each Book's review lines into Quotation objects.

    Walks book.reviewlines for every Book in booklist, separating the
    running text of reviews (accumulated) from the one-line citations
    that introduce them (leading +/- sentiment marks, journal name,
    volume:page numbers, trailing word count). Returns a flat list of
    Quotation objects across all books.
    """
    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')
    reviewnames = set(reviewdict.keys())  # NOTE(review): built but never used below

    # (tag, pattern) pairs for the lexer; a token may match several tags.
    lexical_patterns = [('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'), \
        ('reviewword', all_reviewwords),
        ('openparen', '\(.*'),
        ('closeparen', '.*\)'),
        ('fullstop', '.*\.'),
        ('commastop', '.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
        ('lineendingyear', '[\'"•■]\d+'),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', '[A-Z\'\,]+'),
        ('dollarprice', '.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
        ('openquote', '[\"\'“‘]+\S*'),
        ('plusorminus', '[\+\-\—]+'),
        ('reviewword', all_reviewwords),
        ('wordcount', '\d*0w[.]?')
        ]
    # e.g. "1530w." — word counts terminate a citation line.
    wordcountregex = re.compile('\d*0w[.]?')
    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []
    # Common OCR misreads of a '+' sentiment mark.
    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }
    for book in booklist:
        lines = book.reviewlines
        accumulated = []      # review-text lines gathered since the last citation
        citationcount = 0
        addtonext = ''        # overflow carried forward when a citation wraps
        skipnext = False      # set when the next physical line was consumed early
        for linecount, line in enumerate(lines):
            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.
            if len(addtonext) > 0:
                line = addtonext + ' ' + line
                addtonext = ''
            if skipnext:
                skipnext = False
                continue
            tokens = line.strip().split()
            if len(tokens) < 1:
                continue
            taglist = lexparse.apply_rule_list(rule_list, tokens)
            # in the first two lines we often have fragments
            # left over from the book bibliographical entry
            if linecount <= 3:
                trailingbibline = False
                for tags in taglist.tagseq:
                    if 'hyphennumber' in tags or 'dollarprice' in tags or 'centprice' in tags:
                        trailingbibline = True
                if trailingbibline:
                    # mark the end of bibliographic material and keep the line
                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue
            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.
            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    sentiment = ''
                    review = 'summary'
                    cite = 'summary'
                    citationcount += 1
                    quote = Quotation(book, review, sentiment, cite, accumulated)
                    allquotes.append(quote)
                    accumulated = []
                    accumulated.append(line)
                    # this line (opening with a quote) will be part of the next quotation
                    matched = True  # NOTE(review): never read afterwards
                    continue
            # Score the likelihood that this line is a citation line.
            oddsofreview = 0
            reviewwordyet = False
            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags and not reviewwordyet:
                    oddsofreview += 1
                    reviewwordyet = True
                if 'plusorminus' in tags:
                    oddsofreview += 1
                if 'somenumeric' in tags and not '-' in word and not ',' in word:
                    oddsofreview += 1
            if (oddsofreview > 1 and linecount > 1) or oddsofreview > 2:
                # Treat this line as a citation: leading +/- marks are sentiment,
                # words before the first numeric token name the reviewing journal,
                # and everything from the first numeric token on is citation data.
                sentimentbits = []
                numericyet = False
                publisherbits = []
                citationbits = []
                nextwordctr = 0
                for word, tags in zip(taglist.stringseq, taglist.tagseq):
                    nextwordctr += 1
                    if not numericyet and word == '+':
                        sentimentbits.append('+')
                        continue
                    if not numericyet and word in plusmisreads:
                        # e.g. '4-' is a fairly common ocr misread for +
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '-' or word == '—' or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '=-' or word == '--' or word == '-—'):
                        sentimentbits.append('-')
                        continue
                    # NOTE(review): '—-' here is already consumed by the first '-' branch.
                    if not numericyet and (word == '==' or word == '=--' or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '+-' or word == '+—' or word == '+='):
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '-+' or word == "—+" or word == '=+'):
                        sentimentbits.append('-')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '++-' or word == '++—' or word == "++="):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                        # NOTE(review): unreachable — this sits after `continue`; the
                        # trailing '-' of the '++-' family was presumably meant to be
                        # appended before the continue. Confirm and fix separately.
                        sentimentbits.append('-')
                    if not numericyet and (word == '++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '+++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and nextwordctr == 1 and word == "H":
                        # this is a weird but common misread; however, it's risky
                        # enough that we should only do it in first position
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if 'somenumeric' in tags:
                        numericyet = True
                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)
                    # If a word count appears mid-line, the remainder belongs to
                    # the next citation; carry it forward via addtonext.
                    if numericyet and 'wordcount' in tags and (
                            nextwordctr < len(taglist.stringseq)):
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break
                # if this line doesn't end with a word count, and the next one does?
                # probably a continuation
                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline
                               ) > 0 and wordcountregex.fullmatch(
                                   wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True
                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)
                citationcount += 1
                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []
            else:
                # odds of review 1 or less
                accumulated.append(line)
    return allquotes
def get_books(pagelist, publishers):
    """Segment OCR'd pages into Book objects plus alphabetical-order errors.

    pagelist is a list of pages, each a list of text lines; publishers is a
    collection of publisher names used to detect citation ends. Returns
    (books, author_errors) where author_errors is a list of
    (textpage, last_author_name, new_author) tuples recorded whenever an
    author name is alphabetically earlier than its predecessor.
    """
    lexical_patterns = [('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'), \
        ('genreword', {'reviewed', 'by', 'review.', 'review', 'by,', 'article.', 'article', 'article-non', 'lit.', 'non-lit.', 'poem.', 'fict.', 'fiction', 'fict', 'fiction.', 'poem'}),
        ('openparen', '\(.*'),
        ('closeparen', '.*\)'),
        ('fullstop', '\S+[\.\?!]'),
        ('commastop', '.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
        ('lineendingyear', '[\'"•■]\d+'),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', '[[A-Z\'\,\‘\.\-:;]*[A-Z]{2,}[A-Z\'\,\‘\.\:;]*'),
        ('dollarprice', '.{0,2}[$\"\'\“].?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', '.?[0-9]{1,2}[-—~]+[0-9]{3,7}.?'),
        ('openquote', '[\"\'“‘]+\S*'),
        ('deweydecimal', '[0-9]{3}[.][0-9-]+'),
        ('numpages', '\d{2,5}p')
        ]
    rule_list = lexparse.patterns2rules(lexical_patterns)
    # Our strategy is to find pairs of lines that bookend a citation. A
    # citation and the lines that follow it (before the next citation)
    # is going to count as an Author.
    # A citation starts with a line whose first word is either capitalized and
    # followed by a comma, or all uppercase and longer than three characters.
    # It should be alphabetically later than the last author-name. Exceptions
    # are flagged.
    # A citation ends with a line (up to five lines later) that contains
    # $ + number or number + c, or number containing a hyphen, or that ends
    # with a publisher.
    # We create a list of author_names as well as a list of Author objects.
    author_errors = []
    books = []
    # We keep track of the last author name we've seen; it's going to be
    # the name that governs the lines we are currently examining.
    last_author_name = ''
    textpage = 1
    reviewlines = []
    citation_started = False
    citation_finished = False
    citationlines = []
    # Dummy citation so the first real Book has something to attach to.
    governing_citation = Citation(['Anonymous. My book. $1.00. Macmillan.'],
                                  rule_list, textpage)
    aligned = 0  # count of consecutive pages whose printed number incremented correctly
    for pagenum, page in enumerate(pagelist):
        for linenum, line in enumerate(page):
            line = line.strip()
            this_line_is_new_citation = False
            # Line numbers are only relevant in guiding us to ignore the running header,
            # and to update the page number. This will be imperfect, because OCR,
            # but that's okay. If we get four successive pages that increment each
            # other as they should (10, 11, 12, 13, ...), we say that enough have been aligned to
            # override checking and just keep adding one per page.
            if aligned >= 4 and linenum == 0:
                textpage += 1
            if linenum < 4:
                thematch = match_strings('BOOK REVIEW DIGEST', line)
                if thematch > 0.8 and len(line) > 7:
                    # Running header: pull a printed page number off either end.
                    wordsinline = line.split()
                    # NOTE(review): this reassigns the outer loop variable
                    # `pagenum` — confirm that shadowing is intentional.
                    if len(wordsinline) > 3 and wordsinline[0].isdigit():
                        pagenum = int(wordsinline[0])
                    elif len(wordsinline) > 3 and wordsinline[-1].isdigit():
                        pagenum = int(wordsinline[-1])
                    if textpage + 1 == pagenum:
                        aligned += 1
                        textpage = pagenum
                    elif pagenum < 20:
                        textpage = pagenum
                    continue  # skip this line
            if len(line) > 15:
                # "—Continued." lines carry no new content.
                thetail = line[-11:]
                thematch = match_strings('—Continued.', thetail)
                if thematch > 0.75:
                    continue
            # Three ways a printed page number can appear: the whole line,
            # the last word, or the first word.
            if line.isdigit() and aligned < 4:
                try:
                    newtextpage = int(line)
                    if textpage + 1 == newtextpage:
                        aligned += 1
                    else:
                        aligned = 0
                    textpage = newtextpage
                except:
                    textpage += 1
            elif len(line) > 12 and line[-2:].isdigit() and aligned < 4:
                words = line.split()
                if words[-1].isdigit():
                    pagenumpart = words[-1]
                    try:
                        newtextpage = int(pagenumpart)
                        if textpage + 1 == newtextpage:
                            aligned += 1
                        else:
                            aligned = 0
                        textpage = newtextpage
                    except:
                        textpage += 1
            elif len(line) > 12 and line[0:2].isdigit() and aligned < 4:
                words = line.split()
                if words[0].isdigit():
                    pagenumpart = words[0]
                    try:
                        newtextpage = int(pagenumpart)
                        if textpage + 1 == newtextpage:
                            aligned += 1
                        else:
                            aligned = 0
                        textpage = newtextpage
                    except:
                        textpage += 1
            if line.startswith('Figures in parenth'):
                continue
            tokens = line.split()
            if len(tokens) < 1:
                continue
            # There are things that look like the beginning of a citation but
            # are actually cross-references
            if "See" in line:
                skepticalofcitestart = True
            else:
                skepticalofcitestart = False
            nextline = linenum + 1
            if nextline < len(page) and "See" in page[nextline]:
                skepticalofcitestart = True
            cannotcitestart = False
            if skepticalofcitestart and linenum + 5 < len(page):
                # A heavily-uppercase line shortly ahead suggests this one
                # is a cross-reference, not a real citation start.
                for lookforward in range(1, 5):
                    futureline = page[linenum + lookforward]
                    if percent_upper(futureline) > .3:
                        cannotcitestart = True
            taglist = lexparse.apply_rule_list(rule_list, tokens)
            firstword = taglist.stringseq[0]
            firsttagset = taglist.tagseq[0]
            # Look ahead for price/edition markers that signal a citation follows.
            cluescitationahead = 0
            distancetolook = 6
            if (len(page) - linenum) < distancetolook:
                distancetolook = len(page) - linenum
            if 'allcaps' in firsttagset and len(
                    firstword) > 2 and distancetolook > 1:
                for lookforward in range(1, distancetolook):
                    futureline = page[linenum + lookforward]
                    if '$' in futureline or "ed." in futureline:
                        cluescitationahead += 1
                    if cluescitationahead > 0:
                        futuretokens = futureline.split()
                        for t in futuretokens:
                            if hyphenregex.fullmatch(t):
                                cluescitationahead += 1
            if not citation_started and not cannotcitestart:
                # Decide whether this line begins a new citation, using a
                # cascade of uppercase-percentage heuristics.
                allcapcount = 0
                for tags in taglist.tagseq:
                    if 'allcaps' in tags:
                        allcapcount += 1
                lineuppercasepct = percent_upper(line)
                if (allcapcount > 0 and lineuppercasepct > 0.1 and len(line) > 9) or (
                        lineuppercasepct > 0.6 and len(line) > 9) or (
                        allcapcount > 0 and cluescitationahead > 0 and len(line) > 9):
                    percentageupper = percent_upper(firstword)
                    if len(line) > 15:
                        pctupin15 = percent_upper(line[0:15])
                    else:
                        pctupin15 = 0
                    if 'allcaps' in firsttagset and len(
                            firstword) > 2 and cluescitationahead > 0:
                        this_line_is_new_citation = True
                    elif lineuppercasepct > 0.72 and len(line) > 14:
                        this_line_is_new_citation = True
                    elif pctupin15 > .35 and ('$' in line or cluescitationahead > 1) and len(reviewlines) > 3:
                        this_line_is_new_citation = True
                    elif percentageupper > 0.7 and len(
                            firstword) > 4 and allcapcount > 2:
                        this_line_is_new_citation = True
                    elif pctupin15 > 0.65:
                        this_line_is_new_citation = True
                    else:
                        reviewlines.append(line)
                else:
                    reviewlines.append(line)
                if this_line_is_new_citation:
                    # a new citation has begun
                    citation_finished = False
                    citationlines = []
                    for string, tags in zip(taglist.stringseq, taglist.tagseq):
                        if 'dollarprice' in tags or 'centprice' in tags:
                            # note that our conditions for ending a citation with the first
                            # line are more stringent than they will be from the second onward
                            citation_finished = True
                            break
            elif not citation_started and cannotcitestart:
                reviewlines.append(line)
            else:
                # if a citation has been started, let's see if we should end it
                for string, tags in zip(taglist.stringseq, taglist.tagseq):
                    if 'dollarprice' in tags or 'centprice' in tags or 'hyphennumber' in tags or 'deweydecimal' in tags:
                        # more conditions can end a citation now
                        citation_finished = True
                        break
                if len(taglist.stringseq) > 1 and taglist.stringseq[-1].strip(
                        '.') in publishers:
                    # sometimes there's no price and the publisher's name is the only clue
                    # that the citation is finished
                    citation_finished = True
                if len(citationlines) > 2 and len(
                        taglist.tagseq
                        ) > 1 and 'somenumeric' in taglist.tagseq[0]:
                    try:
                        deweydecimal = float(taglist.stringseq[0])
                        if deweydecimal > 99:
                            citation_finished = True
                    except:
                        pass
            if this_line_is_new_citation or citation_started:
                citationlines.append(line)
                citation_started = True
                this_line_is_new_citation = False
            if citation_finished:
                # we have concluded a new citation
                # first, make the last citation into a book:
                thisbook = Book(governing_citation, reviewlines)
                books.append(thisbook)
                # initialize reviewlines, and create a new citation
                reviewlines = []
                citation_started = False
                citation_finished = False
                # we finished that citation, started a new one
                governing_citation = Citation(citationlines, rule_list, textpage)
                citationlines = []
                new_author = governing_citation.author
                if new_author < last_author_name:
                    author_errors.append(
                        (textpage, last_author_name, new_author))
                last_author_name = new_author
            elif len(citationlines) > 8:
                # this is too many lines, and we were probably in error to have
                # started the citation, so put those lines back in reviewlines.
                # This is esp. likely to happen at the top of a page, when
                # an entry is "continued."
                reviewlines.extend(citationlines)
                citationlines = []
                citation_started = False
            elif len(citationlines) > 2:
                lineuppercasepct = percent_upper(line)
                lastuppercasepct = percent_upper(citationlines[-2])
                if lineuppercasepct > .45 and cluescitationahead > 0 and len(
                        line) > 12 and lastuppercasepct < .45:
                    # we started a citation in error two or more lines back; this is the actual
                    # citation start
                    # notice that we check the pct uppercase of last line to make sure this isn't
                    # just a long multiline author name!
                    # discarded = citationlines[0: -1]
                    # for d in discarded:
                    #     print(d)
                    citationlines = [citationlines[-1]]
    return books, author_errors
def __init__(self, linelist, rule_list, textpage):
    """Parse a citation's raw lines into author, title, publisher, and price.

    linelist: the raw text lines of one citation; rule_list: lexparse rules
    used to tag tokens; textpage: the printed page number, stored on
    self.pagenum. Sets self.author, self.title, self.publisher (space-joined
    strings) and self.price (a number, 0 if none found).
    """
    self.pagenum = textpage
    # Flatten all lines into one (token, tagset) sequence.
    alltuples = []
    for line in linelist:
        tokens = line.strip().split()
        if len(tokens) < 1:
            continue
        taglist = lexparse.apply_rule_list(rule_list, tokens)
        for astring, tags in zip(taglist.stringseq, taglist.tagseq):
            alltuples.append((astring, tags))
    titlestart = False
    titledone = False
    authordone = False
    authorstop = False
    dollarpricefound = False
    # The underlying logic here is elegant. We take words up to the first
    # full stop as the "author." From there to the next full stop is
    # the "title." Except, well, in cases of initials.
    # Since "Adams, B. V." has two periods, the rule is that we need
    # a non-fullstopped word or a word of more than three characters to trigger
    # 'title."
    # To implement that we need two flags:
    # authorstop -- we have reached a full stop
    # authordone -- we also hit a subsequent word that lacks a period
    #               or is more than three chars long
    title = []
    author = []
    price = 0
    publisher = []
    tokenssincenumpages = 3  # tokens seen since a "123p" page-count token
    for word, tags in alltuples:
        tokenssincenumpages += 1
        if authorstop and not authordone:
            # We've hit a full stop but may still be inside initials.
            if word.startswith('eds.') or word.startswith('pseud.'):
                author.append(word)
            elif len(word) > 1 and numcaps(word) / len(word) < 1:
                # A word that isn't all-caps ends the author segment.
                authordone = True
                if word[0].isupper():
                    titlestart = True
                    title.append(word)
            else:
                author.append(word)
        elif not authordone:
            author.append(word)
            if 'fullstop' in tags:
                authorstop = True
            elif len(word) > 1 and numcaps(word) / len(word) < 0.6:
                authorstop = True
        elif not titledone:
            if word[0].isupper():
                titlestart = True
            if titlestart and 'fullstop' in tags:
                titledone = True
            if titlestart and 'dollarprice' in tags and not 'numpages' in tags:
                price = pricetranslate(word)
                if '$' in word:
                    dollarpricefound = True
                titledone = True
            if titlestart and 'numpages' in tags:
                titledone = True
                publisher.append(word)
                tokenssincenumpages = 0
            else:
                title.append(word)
        else:
            # Past the title: remaining tokens are publisher/price material.
            if titlestart and 'numpages' in tags:
                publisher.append(word)
                tokenssincenumpages = 0
            elif tokenssincenumpages < 3 and 'somenumeric' in tags:
                # A number right after the page count is very likely a price,
                # so translate it aggressively.
                if '$' in word:
                    dollarpricefound = True
                price = aggressivepricetranslate(word)
            elif 'dollarprice' in tags:
                tryprice = pricetranslate(word)
                if not dollarpricefound:
                    price = tryprice
                # NOTE(review): nesting reconstructed — confirm whether this
                # '$' check was inside the `not dollarpricefound` branch.
                if '$' in word:
                    dollarpricefound = True
            elif 'centprice' in tags and not dollarpricefound:
                price = pricetranslate(word)
            elif word.strip(
                    "'‘’*t") in valid_prices and not dollarpricefound:
                price = valid_prices[word.strip("'‘’*t")]
            else:
                publisher.append(word)
    self.author = ' '.join(author)
    self.title = ' '.join(title)
    self.publisher = ' '.join(publisher)
    self.price = price
def divide_into_quotations(booklist):
    """Split each Book's review lines into Quotation objects (extended version).

    Extends the earlier pass with: fuzzy matching of long journal names (to
    tolerate OCR error), an OCR variant of the word-count pattern ("Ow" for
    "0w"), and repair of publisher names split across a book entry's trailing
    bibliographic line. Returns a flat list of Quotation objects.
    NOTE(review): this redefines an earlier function of the same name — if both
    live in one module, only this definition survives; confirm intent.
    """
    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')
    # First words of multi-word journal names, long enough to fuzzy-match.
    longreviewnames = set()
    for rev in reviewdict.keys():
        reviewparts = rev.split()
        if len(reviewparts) < 1:
            continue
        elif len(reviewparts[0]) > 4:
            longreviewnames.add(reviewparts[0])
    # Known publisher names (including common OCR misreads like 'Liverlght').
    publishers = [
        'Liverlght', 'Appleton', 'Baker', 'Barnes', 'Benziger', 'Bobbs',
        "Brentano's", 'Cassell', 'Century', 'Collier-Fox', 'Crowell',
        'Ditson', 'Dodd', 'Doran', 'Doubleday', 'Dutton', 'Elder', 'Estes',
        'Ginn', 'Goodspeed', 'Harper', 'Heath', 'Holt', 'Houghton', 'Knopf',
        'Lane', 'Lippincott', 'Little', 'Liveright', 'Longmans', 'Macmillan',
        'McBride', 'McClure', 'McGraw', 'Moffat', 'Oxford', 'Page', 'Pott',
        'Putnam', 'Scribner', 'Simmons', 'Stokes', 'Walton', 'Warne',
        'Wessels', 'Wilde', 'Wiley', 'Winston', 'Yale'
    ]
    lexical_patterns = [('numeric', '.?[0-9]{1,7}.?[0-9]*[,.:=]?'), \
        ('reviewword', all_reviewwords),
        ('openparen', '\(.*'),
        ('closeparen', '.*\)'),
        ('fullstop', '.*\.'),
        ('commastop', '.*\,'),
        ('startdash', '—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', '[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag', 'S', 'O', 'N', 'D'}),
        ('lineendingyear', '[\'"•■]\d+'),
        ('volandpgrange', '[0-9]+[:][0-9-]+'),
        ('somenumeric', '.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', '[A-Z\'\,]+'),
        ('dollarprice', '.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', '.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', '[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
        ('openquote', '[\"\'“‘]+\S*'),
        ('plusorminus', '[\+\-\—]+'),
        ('reviewword', all_reviewwords),
        ('wordcount', '\d*0w[.]?'),
        ('OCRwordcount', '\S*Ow[.]?')
        ]
    wordcountregex = re.compile('\d*0w[.]?')
    # OCR often reads the digit 0 as the letter O.
    ocrwordcountregex = re.compile('\S*Ow[.]?')
    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []
    # Common OCR misreads of a '+' sentiment mark.
    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }
    for book in booklist:
        lines = book.reviewlines
        accumulated = []      # review-text lines gathered since the last citation
        citationcount = 0
        addtonext = ''        # overflow carried forward when a citation wraps
        skipnext = False      # set when the next physical line was consumed early
        for linecount, line in enumerate(lines):
            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.
            if len(addtonext) > 0:
                line = addtonext + ' ' + line
                addtonext = ''
            if skipnext:
                skipnext = False
                continue
            tokens = line.strip().split()
            if len(tokens) < 1:
                continue
            taglist = lexparse.apply_rule_list(rule_list, tokens)
            # in the first two lines we often have fragments
            # left over from the book bibliographical entry
            if linecount <= 3:
                trailingbibline = False
                for tags in taglist.tagseq:
                    if 'hyphennumber' in tags or 'dollarprice' in tags or 'centprice' in tags:
                        trailingbibline = True
                if trailingbibline:
                    # get the existing publisher to see if it makes more sense
                    # fused with something in this trailing line
                    existingpubparts = book.publisher.split()
                    if len(existingpubparts) > 0:
                        existingpub = existingpubparts[-1].strip('-')
                    else:
                        existingpub = 'not a publisher'
                    tokenssofar = []
                    for l in accumulated:
                        tokenssofar.extend(l.strip().split())
                    tokenssofar.extend(tokens)
                    tokenssofar = [x.strip('.,[]()-') for x in tokenssofar]
                    for tok in tokenssofar:
                        if tok in publishers:
                            book.publisher = tok
                        # A hyphen-split publisher ("Mac-" + "millan") rejoins here.
                        rejoined = existingpub + tok
                        if rejoined in publishers:
                            book.publisher = book.publisher.strip('-') + tok
                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue
            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.
            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    sentiment = ''
                    review = 'summary'
                    cite = 'summary'
                    citationcount += 1
                    quote = Quotation(book, review, sentiment, cite, accumulated)
                    allquotes.append(quote)
                    accumulated = []
                    accumulated.append(line)
                    # this line (opening with a quote) will be part of the next quotation
                    matched = True  # NOTE(review): never read afterwards
                    continue
            # Score the line: citation lines mix journal names, sentiment
            # marks, month abbreviations, and clean numbers.
            numberwords = 0
            reviewwords = 0
            plusyet = False
            totalclues = 0
            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags:
                    reviewwords += 1
                    totalclues += 1
                elif 'plusorminus' in tags and not plusyet:
                    reviewwords += 0.5
                    totalclues += 1
                    plusyet = True
                elif 'monthabbrev' in tags:
                    totalclues += 1
                elif 'somenumeric' in tags and not '-' in word and not ',' in word:
                    numberwords += 1
                    totalclues += 1
                    if word.endswith('w'):
                        totalclues += 1
                        reviewwords += 0.5
                    elif ':' in word:
                        totalclues += 1
                        reviewwords += 0.5
                    elif word.startswith('p'):
                        totalclues += 1
                        reviewwords += 0.5
            # fuzzy match in situations where everything is there except the review
            # because it could easily be ocr error
            if numberwords > 0 and totalclues > 2 and reviewwords < 0.9:
                firstword = taglist.stringseq[0]
                if len(firstword) > 3:
                    for longname in longreviewnames:
                        similarity = match_strings(firstword, longname)
                        if similarity > .7:
                            reviewwords += 1
                            totalclues += 1
                            break
            if numberwords > 0 and reviewwords > 0.9 and totalclues > 3:
                # Treat this line as a citation: leading +/- marks are sentiment,
                # words before the first numeric token name the reviewing journal,
                # and everything from the first numeric token on is citation data.
                sentimentbits = []
                numericyet = False
                publisherbits = []
                citationbits = []
                nextwordctr = 0
                for word, tags in zip(taglist.stringseq, taglist.tagseq):
                    nextwordctr += 1
                    if not numericyet and word == '+':
                        sentimentbits.append('+')
                        continue
                    if not numericyet and word in plusmisreads:
                        # e.g. '4-' is a fairly common ocr misread for +
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '-' or word == '—' or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '=-' or word == '--' or word == '-—'):
                        sentimentbits.append('-')
                        continue
                    # NOTE(review): '—-' here is already consumed by the first '-' branch.
                    if not numericyet and (word == '==' or word == '=--' or word == '—-'):
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '+-' or word == '+—' or word == '+='):
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if not numericyet and (word == '-+' or word == "—+" or word == '=+'):
                        sentimentbits.append('-')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '++-' or word == '++—' or word == "++="):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                        # NOTE(review): unreachable — this sits after `continue`; the
                        # trailing '-' of the '++-' family was presumably meant to be
                        # appended before the continue. Confirm and fix separately.
                        sentimentbits.append('-')
                    if not numericyet and (word == '++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and (word == '+++'):
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        sentimentbits.append('+')
                        continue
                    if not numericyet and nextwordctr == 1 and word == "H":
                        # this is a weird but common misread; however, it's risky
                        # enough that we should only do it in first position
                        sentimentbits.append('+')
                        sentimentbits.append('-')
                        continue
                    if 'somenumeric' in tags:
                        numericyet = True
                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)
                    # If a word count appears mid-line, the remainder belongs to
                    # the next citation; carry it forward via addtonext.
                    if numericyet and ('wordcount' in tags or 'OCRwordcount'
                                       in tags) and (nextwordctr < len(
                                           taglist.stringseq)):
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break
                # if this line doesn't end with a word count, and the next one does?
                # probably a continuation
                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]) and not ocrwordcountregex.fullmatch(
                        citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline) > 0 and len(
                                wordsinnextline
                                ) < 3 and wordcountregex.fullmatch(
                                wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True
                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)
                citationcount += 1
                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []
            else:
                # odds of review 1 or less
                accumulated.append(line)
    return allquotes
def subdivide_author(auth, rule_list):
    '''Split an Author's tagged line groups into a list of Citations.

    auth.get_lines() yields (line, group_tag) pairs, where the tag marks
    "WORKS BY" vs "WORKS ABOUT" sections (every line of a WORKS ABOUT
    section carries the "about" tag). Consecutive lines sharing a tag are
    merged into one TaggedList; whenever the tag flips, the accumulated
    group is handed to divide_into_citations and a fresh group begins.
    The final group is flushed after the loop. Returns the combined list
    of Citations for this author.
    '''
    author_name = auth.name
    print(author_name)

    citation_list = []
    tokenchunk = []
    # current_group holds a str sentinel until the first TaggedList is built;
    # the post-loop flush uses that to detect "no lines at all."
    current_group = 'not yet used'
    current_tag = 'match-any'

    for raw_line, section_tag in auth.get_lines():
        words = raw_line.strip().split()
        if not words:
            continue
        tagged_line = lexparse.apply_rule_list(rule_list, words)
        if current_tag == 'match-any':
            # first non-empty line for this author starts the first group
            current_group = tagged_line
            current_tag = section_tag
        elif section_tag == current_tag:
            # same section (WORKS ABOUT or WORKS BY): grow the current group
            current_group.extend(tagged_line)
        else:
            # the tag flipped: close out the finished group as citations,
            # then open a new group under the new tag
            citation_list.extend(
                divide_into_citations(current_group, current_tag, author_name))
            current_group = tagged_line
            current_tag = section_tag

    # No closing tag flip follows the last group, so flush it explicitly
    # (skipping the str sentinel when no lines were seen).
    if type(current_group) != str:
        citation_list.extend(
            divide_into_citations(current_group, current_tag, author_name))

    return citation_list
def parse_pages(pagefiles, rule_list, prev_entry = 'A'):
    """Parse each page file into citations and write one .tsv per page.

    pagefiles: paths to OCR'd page text files; rule_list: lexparse rules for
    tagging tokens; prev_entry: the last entry headword seen (inherited from
    the previous page) used to keep entries in alphabetical order.
    For each file, writes results to the path with 'clean'->'results' and
    '.txt'->'.tsv' substituted. Returns nothing.

    BUGFIXES relative to the original:
    - `if linenum2 = linenum + 1` was a SyntaxError; changed to `==`.
    - When every entrypoint after the current one was sequential, the inner
      chunk loop exhausted without recording a chunk, silently dropping the
      lines from that entrypoint onward; a for/else now records it.
    """
    for p in pagefiles:
        ## first read in a page and tag all the strings in each line
        with open(p, encoding = 'utf-8') as f:
            filelines = f.readlines()
        tagged_page = []
        for fl in filelines:
            tokens = fl.strip().split()
            if len(tokens) < 1:
                continue
            else:
                tagged_line = lexparse.apply_rule_list(rule_list, tokens)
                tagged_page.append(tagged_line)

        ## now identify the lines themselves as entries or as citations
        linetuples = []
        for index, line in enumerate(tagged_page):
            if index < 2 and is_header(line):
                linetag = 'header'
            elif is_citation_end(line):
                linetag = 'citationend'
            elif has_citation(line):
                linetag = 'citationpart'
            elif is_entry(line):
                linetag = 'entry'
            elif is_all_text(line):
                linetag = 'alltext'
            else:
                linetag = 'ambiguous'
            linetuples.append((linetag, line))

        # Now we organize lines into groups that share an entry.
        # First, use alphabetical sequence to confirm entries. We're going create
        # a list of lines that we think are entries.
        entrypoints = []

        # We're going to rely on the variable prev_entry, inherited from
        # the previous page, to make sure
        # that entries are in alphabetical order. But we also need a
        # way to correct errors if we get off.
        lowerthanprev = 0
        allentries = 1
        # note a bit of additive smoothing
        firstonpage = 'Aa'
        for ltuple in linetuples:
            tag, line = ltuple
            if tag == 'entry':
                firstword = line.stringseq[0]
                if firstonpage == 'Aa':
                    firstonpage = firstword
                    # 'Aa' is just a flag that we haven't taken the firstonpage yet
                allentries += 1
                if firstword < prev_entry:
                    lowerthanprev += 1
        if (lowerthanprev / allentries) > 0.5:
            prev_entry = firstonpage
            # If more than half the entries on the page begin with a word
            # alphabetically earlier than prev_entry, we have gotten out of
            # order, and the prev_entry needs to be reset to the first on page.

        for idx, ltuple in enumerate(linetuples):
            tag, line = ltuple
            firstword = line.stringseq[0]
            if tag == 'entry' and firstword >= prev_entry:
                entrypoints.append(idx)
                prev_entry = firstword
            elif tag == 'alltext' and line.stringseq[0] >= prev_entry:
                # An alltext line may really be an entry: accept it if the next
                # confirmed entry is alphabetically later than it.
                for idx2 in range(idx + 1, len(linetuples)):
                    tag2, line2 = linetuples[idx2]
                    if tag2 == 'entry' and line2.stringseq[0] >= firstword:
                        entrypoints.append(idx)
                        prev_entry = firstword
                        break
                    elif tag2 == 'entry':
                        break
                    else:
                        continue

        # okay, now we have a list of lines that we think are entries.
        # we can use that to create chunks of lines that share the same
        # entry
        chunks = []
        for idx, entrypoint in enumerate(entrypoints):
            linenum = entrypoint
            if idx + 1 >= len(entrypoints):
                chunks.append((entrypoint, len(linetuples)))
            else:
                for idx2 in range(idx + 1, len(entrypoints)):
                    linenum2 = entrypoints[idx2]
                    # BUGFIX: was `if linenum2 = linenum + 1:` (SyntaxError).
                    if linenum2 == linenum + 1:
                        # sequential entries should be kept together
                        linenum = linenum2
                    else:
                        # aha, a break
                        chunks.append((entrypoint, linenum2))
                        break
                else:
                    # BUGFIX: all remaining entrypoints were sequential, so the
                    # loop exhausted without recording this chunk; close it at
                    # the end of the page instead of dropping it.
                    chunks.append((entrypoint, len(linetuples)))
                # NOTE(review): sequential entrypoints still each emit a chunk
                # ending at the same place, so chunks can overlap — confirm
                # whether downstream de-duplicates.

        citations = []
        for chunktuple in chunks:
            startline, stopline = chunktuple
            new_chunk = linetuples[startline: stopline]
            new_citations = parse_chunk(new_chunk)
            citations.extend(new_citations)

        # Propagate the last real header down into '<head...' placeholders.
        current_header = ''
        for cite in citations:
            main_subject = cite.get_subject('main')
            if main_subject.startswith('<head'):
                cite.set_subject('main', current_header)
            else:
                current_header = main_subject

        outpath = p.replace('clean', 'results').replace('.txt', '.tsv')
        columns = ['mainhed', 'subhed', 'author', 'journalname', 'volandpg', 'fullcite']
        with open(outpath, mode = 'w', encoding = 'utf-8') as f:
            writer = csv.DictWriter(f, delimiter = '\t', fieldnames = columns)
            writer.writeheader()
            for cite in citations:
                outrow = cite.get_outrow()
                writer.writerow(outrow)