def authors_to_citations(author_list):
    """Turn a sequence of author entries into one flat list of parsed citations.

    Each element of ``author_list`` is passed to ``subdivide_author`` together
    with the lexical rules built below; the resulting citation list is handed
    to ``parse_parts`` and then appended to the overall result.

    Parameters
    ----------
    author_list : iterable
        Author entries, in whatever form ``subdivide_author`` accepts.

    Returns
    -------
    list
        Every citation produced for every author, in input order.
    """
    # (tag, pattern) pairs. A pattern is either a regular expression or a set
    # of literal tokens. Regexes are raw strings so that backslash escapes
    # reach the regex engine unchanged and do not trigger Python's
    # invalid-escape-sequence warning.
    lexical_patterns = [
        ('numeric', r'.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('genreword', {'reviewed', 'by', 'review.', 'review', 'by,',
                       'article.', 'article', 'article-non', 'lit.',
                       'non-lit.', 'poem.', 'fict.', 'fiction', 'fict',
                       'fiction.', 'poem'}),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'.*\.'),
        ('commastop', r'.*\,'),
        ('startdash', r'—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', r'[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', r'''['"•■]\d+'''),
        ('volandpgrange', r'[0-9]+[:][0-9-]+'),
        ('somenumeric', r'.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', r"[A-Z']+"),
    ]

    rule_list = lexparse.patterns2rules(lexical_patterns)

    all_parsed_citations = []
    for auth in author_list:
        citation_list = subdivide_author(auth, rule_list)
        # parse_parts is called for its effect on citation_list; its return
        # value is not used here.
        parse_parts(citation_list)
        all_parsed_citations.extend(citation_list)

    return all_parsed_citations
def divide_into_quotations(booklist):
    """Split the review lines of each book into Quotation objects.

    For every book, ``book.reviewlines`` is scanned line by line. Ordinary
    lines are accumulated; when a line scores as a review citation (it
    contains enough of: a known review word, a plus/minus sentiment token,
    a number), the accumulated lines are packaged with that citation into a
    ``Quotation(book, review, sentiment, cite, accumulated)`` and the
    accumulator is reset.

    Parameters
    ----------
    booklist : iterable
        Objects exposing a ``reviewlines`` sequence of strings.

    Returns
    -------
    list
        All Quotation objects created, in the order encountered. Lines that
        follow the last citation of a book are not emitted.
    """
    all_reviewwords, _ = read_pubnames.get_names('brd_pubs_indexed1920s.tsv')

    # (tag, pattern) pairs; patterns are regexes (raw strings) or token sets.
    lexical_patterns = [
        ('numeric', r'.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('reviewword', all_reviewwords),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'.*\.'),
        ('commastop', r'.*\,'),
        ('startdash', r'—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', r'[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', r'''['"•■]\d+'''),
        ('volandpgrange', r'[0-9]+[:][0-9-]+'),
        ('somenumeric', r'.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', r"[A-Z'\,]+"),
        ('dollarprice', r'.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', r'.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', r'[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
        ('openquote', r'''["'“‘]+\S*'''),
        ('plusorminus', r'[\+\-\—]+'),
        ('reviewword', all_reviewwords),
        ('wordcount', r'\d*0w[.]?'),
    ]

    wordcountregex = re.compile(r'\d*0w[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    # e.g. '4-' is a fairly common ocr misread for +
    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    # Sentiment token -> canonical marks. Only consulted before the first
    # numeric token of a citation line.
    sentiment_marks = {
        '+': ('+',),
        '-': ('-',), '—': ('-',), '—-': ('-',),
        '=-': ('-',), '--': ('-',), '-—': ('-',),
        '==': ('-',), '=--': ('-',),
        '+-': ('+', '-'), '+—': ('+', '-'), '+=': ('+', '-'),
        '-+': ('-', '+'), '—+': ('-', '+'), '=+': ('-', '+'),
        # The trailing minus used to be appended after a `continue` and was
        # therefore never recorded.
        '++-': ('+', '+', '-'), '++—': ('+', '+', '-'),
        '++=': ('+', '+', '-'),
        '++': ('+', '+'),
        '+++': ('+', '+', '+'),
    }
    for misread in plusmisreads:
        sentiment_marks[misread] = ('+',)

    for book in booklist:
        lines = book.reviewlines
        accumulated = []
        citationcount = 0
        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):
            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            if addtonext:
                # text left over after a word count on the previous line
                line = addtonext + ' ' + line
                addtonext = ''

            if skipnext:
                # this line was already absorbed into the previous citation
                skipnext = False
                continue

            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # in the first four lines (linecount <= 3) we often have
            # fragments left over from the book bibliographical entry
            if linecount <= 3:
                trailingbibline = any(
                    'hyphennumber' in tags or 'dollarprice' in tags
                    or 'centprice' in tags
                    for tags in taglist.tagseq)
                if trailingbibline:
                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.
            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    citationcount += 1
                    quote = Quotation(book, 'summary', '', 'summary',
                                      accumulated)
                    allquotes.append(quote)
                    # this line (opening with a quote) will be part of
                    # the next quotation
                    accumulated = [line]
                    continue

            # Score the line: one point for the first review word, one per
            # plus/minus token, one per number without '-' or ','.
            oddsofreview = 0
            reviewwordyet = False

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags and not reviewwordyet:
                    oddsofreview += 1
                    reviewwordyet = True
                if 'plusorminus' in tags:
                    oddsofreview += 1
                if 'somenumeric' in tags and '-' not in word and ',' not in word:
                    oddsofreview += 1

            if (oddsofreview > 1 and linecount > 1) or oddsofreview > 2:
                sentimentbits = []
                numericyet = False
                publisherbits = []
                citationbits = []
                nextwordctr = 0

                for word, tags in zip(taglist.stringseq, taglist.tagseq):
                    nextwordctr += 1

                    if not numericyet:
                        if word in sentiment_marks:
                            sentimentbits.extend(sentiment_marks[word])
                            continue
                        if nextwordctr == 1 and word == "H":
                            # this is a weird but common misread; however,
                            # it's risky enough that we should only do it
                            # in first position
                            sentimentbits.extend(('+', '-'))
                            continue

                    if 'somenumeric' in tags:
                        numericyet = True

                    # words before the first number name the review;
                    # everything from the first number on is the citation
                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)

                    if numericyet and 'wordcount' in tags and (
                            nextwordctr < len(taglist.stringseq)):
                        # a word count closes the citation; carry the rest
                        # of the line over to the next line
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break

                # if this line doesn't end with a word count, and the next
                # one does? probably a continuation
                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        if len(wordsinnextline) > 0 and wordcountregex.fullmatch(
                                wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True

                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)

                citationcount += 1
                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []

            else:
                # not enough evidence of a citation: ordinary review text
                accumulated.append(line)

    return allquotes
def _update_textpage(candidate, textpage, aligned):
    """Fold a candidate page-number string into (textpage, aligned).

    If ``candidate`` parses as an int, it becomes the new text page;
    ``aligned`` is incremented when it follows the previous page by exactly
    one and reset to 0 otherwise. If it does not parse, the page is simply
    advanced by one and ``aligned`` is left alone.
    """
    try:
        newtextpage = int(candidate)
    except ValueError:
        # str.isdigit() accepts some characters that int() rejects
        return textpage + 1, aligned

    if textpage + 1 == newtextpage:
        aligned += 1
    else:
        aligned = 0
    return newtextpage, aligned


def get_books(pagelist, publishers):
    """Segment pages of text into Book objects, one per detected citation.

    Parameters
    ----------
    pagelist : sequence
        Pages; each page is an indexable sequence of line strings.
    publishers : container
        Publisher names; the last token of a line (stripped of '.') is
        looked up here as one signal that a citation has ended.

    Returns
    -------
    tuple
        ``(books, author_errors)``. ``books`` is a list of
        ``Book(citation, reviewlines)``; the first one pairs a placeholder
        'Anonymous' citation with whatever lines precede the first detected
        citation. ``author_errors`` is a list of
        ``(textpage, last_author_name, new_author)`` tuples recorded when a
        new author sorts before the previous one.

    NOTE(review): a Book is only appended when the *next* citation finishes,
    so the final citation and its review lines are never returned. That looks
    unintended, but is left unchanged here -- confirm with callers.
    """
    # (tag, pattern) pairs; patterns are regexes (raw strings) or token sets.
    lexical_patterns = [
        ('numeric', r'.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('genreword', {'reviewed', 'by', 'review.', 'review', 'by,',
                       'article.', 'article', 'article-non', 'lit.',
                       'non-lit.', 'poem.', 'fict.', 'fiction', 'fict',
                       'fiction.', 'poem'}),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'\S+[\.\?!]'),
        ('commastop', r'.*\,'),
        ('startdash', r'—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', r'[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', r'''['"•■]\d+'''),
        ('volandpgrange', r'[0-9]+[:][0-9-]+'),
        ('somenumeric', r'.?.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        # NOTE(review): the doubled '[' makes '[' a member of the first
        # character class; kept byte-for-byte.
        ('allcaps', r"[[A-Z'\,\‘\.\-:;]*[A-Z]{2,}[A-Z'\,\‘\.\:;]*"),
        ('dollarprice', r'''.{0,2}[$"'\“].?.?[0-9]{1,7}.?[0-9]*[,.:=]?'''),
        ('centprice', r'.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', r'.?[0-9]{1,2}[-—~]+[0-9]{3,7}.?'),
        ('openquote', r'''["'“‘]+\S*'''),
        ('deweydecimal', r'[0-9]{3}[.][0-9-]+'),
        ('numpages', r'\d{2,5}p'),
    ]

    rule_list = lexparse.patterns2rules(lexical_patterns)

    # Our strategy is to find pairs of lines that bookend a citation. A
    # citation and the lines that follow it (before the next citation)
    # is going to count as an Author.

    # A citation starts with a line whose first word is either capitalized and
    # followed by a comma, or all uppercase and longer than three characters.
    # It should be alphabetically later than the last author-name. Exceptions
    # are flagged.

    # A citation ends with a line (up to five lines later) that contains
    # $ + number or number + c, or number containing a hyphen, or that ends
    # with a publisher.

    # We create a list of author_names as well as a list of Author objects.

    author_errors = []
    books = []

    # We keep track of the last author name we've seen; it's going to be
    # the name that governs the lines we are currently examining.
    last_author_name = ''

    textpage = 1
    reviewlines = []
    citation_started = False
    citation_finished = False
    citationlines = []
    governing_citation = Citation(['Anonymous. My book. $1.00. Macmillan.'],
                                  rule_list, textpage)
    aligned = 0

    for pagenum, page in enumerate(pagelist):
        for linenum, line in enumerate(page):
            line = line.strip()
            this_line_is_new_citation = False

            # Line numbers are only relevant in guiding us to ignore the
            # running header, and to update the page number. This will be
            # imperfect, because OCR, but that's okay. If we get four
            # successive pages that increment each other as they should
            # (10, 11, 12, 13, ...), we say that enough have been aligned to
            # override checking and just keep adding one per page.

            if aligned >= 4 and linenum == 0:
                textpage += 1

            if linenum < 4:
                thematch = match_strings('BOOK REVIEW DIGEST', line)
                if thematch > 0.8 and len(line) > 7:
                    wordsinline = line.split()
                    if len(wordsinline) > 3 and wordsinline[0].isdigit():
                        headernumber = wordsinline[0]
                    elif len(wordsinline) > 3 and wordsinline[-1].isdigit():
                        headernumber = wordsinline[-1]
                    else:
                        headernumber = None

                    # NOTE(review): pagenum is also the enumerate() index of
                    # this page; when the header carries no number, that index
                    # is what gets compared to textpage below.
                    if headernumber is not None:
                        try:
                            pagenum = int(headernumber)
                        except ValueError:
                            # isdigit() is true for characters int() cannot
                            # parse; treat as "no number found"
                            pass

                    if textpage + 1 == pagenum:
                        aligned += 1
                        textpage = pagenum
                    elif pagenum < 20:
                        textpage = pagenum

                    continue
                    # skip this line

                if len(line) > 15:
                    thetail = line[-11:]
                    thematch = match_strings('—Continued.', thetail)
                    if thematch > 0.75:
                        continue

                # Until alignment is established, look for a bare page number
                # on its own, at the end, or at the start of the line.
                if line.isdigit() and aligned < 4:
                    textpage, aligned = _update_textpage(
                        line, textpage, aligned)
                elif len(line) > 12 and line[-2:].isdigit() and aligned < 4:
                    words = line.split()
                    if words[-1].isdigit():
                        textpage, aligned = _update_textpage(
                            words[-1], textpage, aligned)
                elif len(line) > 12 and line[0:2].isdigit() and aligned < 4:
                    words = line.split()
                    if words[0].isdigit():
                        textpage, aligned = _update_textpage(
                            words[0], textpage, aligned)

            if line.startswith('Figures in parenth'):
                continue

            tokens = line.split()
            if len(tokens) < 1:
                continue

            # There are things that look like the beginning of a citation but
            # are actually cross-references

            skepticalofcitestart = "See" in line

            nextline = linenum + 1
            if nextline < len(page) and "See" in page[nextline]:
                skepticalofcitestart = True

            cannotcitestart = False

            if skepticalofcitestart and linenum + 5 < len(page):
                for lookforward in range(1, 5):
                    futureline = page[linenum + lookforward]
                    if percent_upper(futureline) > .3:
                        cannotcitestart = True

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            firstword = taglist.stringseq[0]
            firsttagset = taglist.tagseq[0]

            # Count hints, in up to the next five lines, that a citation
            # is under way.
            cluescitationahead = 0

            distancetolook = 6
            if (len(page) - linenum) < distancetolook:
                distancetolook = len(page) - linenum

            if 'allcaps' in firsttagset and len(
                    firstword) > 2 and distancetolook > 1:
                for lookforward in range(1, distancetolook):
                    futureline = page[linenum + lookforward]
                    if '$' in futureline or "ed." in futureline:
                        cluescitationahead += 1
                    if cluescitationahead > 0:
                        futuretokens = futureline.split()
                        for t in futuretokens:
                            if hyphenregex.fullmatch(t):
                                cluescitationahead += 1

            if not citation_started and not cannotcitestart:
                allcapcount = 0
                for tags in taglist.tagseq:
                    if 'allcaps' in tags:
                        allcapcount += 1

                lineuppercasepct = percent_upper(line)

                if (allcapcount > 0 and lineuppercasepct > 0.1
                        and len(line) > 9) or (
                        lineuppercasepct > 0.6 and len(line) > 9) or (
                        allcapcount > 0 and cluescitationahead > 0
                        and len(line) > 9):

                    percentageupper = percent_upper(firstword)
                    if len(line) > 15:
                        pctupin15 = percent_upper(line[0:15])
                    else:
                        pctupin15 = 0

                    if 'allcaps' in firsttagset and len(
                            firstword) > 2 and cluescitationahead > 0:
                        this_line_is_new_citation = True
                    elif lineuppercasepct > 0.72 and len(line) > 14:
                        this_line_is_new_citation = True
                    elif pctupin15 > .35 and (
                            '$' in line or cluescitationahead > 1) and len(
                            reviewlines) > 3:
                        this_line_is_new_citation = True
                    elif percentageupper > 0.7 and len(
                            firstword) > 4 and allcapcount > 2:
                        this_line_is_new_citation = True
                    elif pctupin15 > 0.65:
                        this_line_is_new_citation = True
                    else:
                        reviewlines.append(line)
                else:
                    reviewlines.append(line)

                if this_line_is_new_citation:
                    # a new citation has begun
                    citation_finished = False
                    citationlines = []
                    for string, tags in zip(taglist.stringseq,
                                            taglist.tagseq):
                        if 'dollarprice' in tags or 'centprice' in tags:
                            # note that our conditions for ending a citation
                            # with the first line are more stringent than
                            # they will be from the second onward
                            citation_finished = True
                            break

            elif not citation_started and cannotcitestart:
                reviewlines.append(line)

            else:
                # if a citation has been started, let's see if we should
                # end it

                for string, tags in zip(taglist.stringseq, taglist.tagseq):
                    if ('dollarprice' in tags or 'centprice' in tags
                            or 'hyphennumber' in tags
                            or 'deweydecimal' in tags):
                        # more conditions can end a citation now
                        citation_finished = True
                        break

                if len(taglist.stringseq) > 1 and taglist.stringseq[-1].strip(
                        '.') in publishers:
                    # sometimes there's no price and the publisher's name is
                    # the only clue that the citation is finished
                    citation_finished = True

                if len(citationlines) > 2 and len(
                        taglist.tagseq
                ) > 1 and 'somenumeric' in taglist.tagseq[0]:
                    try:
                        deweydecimal = float(taglist.stringseq[0])
                    except ValueError:
                        # first token only looked numeric; not a class number
                        pass
                    else:
                        if deweydecimal > 99:
                            citation_finished = True

            if this_line_is_new_citation or citation_started:
                citationlines.append(line)
                citation_started = True
                this_line_is_new_citation = False

            if citation_finished:
                # we have concluded a new citation
                # first, make the last citation into a book:

                thisbook = Book(governing_citation, reviewlines)
                books.append(thisbook)

                # initialize reviewlines, and create a new citation
                reviewlines = []
                citation_started = False
                citation_finished = False
                # we finished that citation, started a new one

                governing_citation = Citation(citationlines, rule_list,
                                              textpage)
                citationlines = []

                new_author = governing_citation.author
                if new_author < last_author_name:
                    author_errors.append(
                        (textpage, last_author_name, new_author))
                last_author_name = new_author

            elif len(citationlines) > 8:
                # this is too many lines, and we were probably in error to
                # have started the citation, so put those lines back in
                # reviewlines. This is esp. likely to happen at the top of a
                # page, when an entry is "continued."
                reviewlines.extend(citationlines)
                citationlines = []
                citation_started = False

            elif len(citationlines) > 2:
                lineuppercasepct = percent_upper(line)
                lastuppercasepct = percent_upper(citationlines[-2])
                if lineuppercasepct > .45 and cluescitationahead > 0 and len(
                        line) > 12 and lastuppercasepct < .45:
                    # we started a citation in error two or more lines back;
                    # this is the actual citation start. Notice that we check
                    # the pct uppercase of last line to make sure this isn't
                    # just a long multiline author name! The earlier lines
                    # are discarded.
                    citationlines = [citationlines[-1]]

    return books, author_errors
def divide_into_quotations(booklist):
    """Split the review lines of each book into Quotation objects.

    For every book, ``book.reviewlines`` is scanned line by line. Ordinary
    lines are accumulated; when a line carries enough citation clues (review
    word, sentiment token, month abbreviation, numbers), the accumulated
    lines are packaged with that citation into a
    ``Quotation(book, review, sentiment, cite, accumulated)``.

    Side effect: while scanning the first lines of a book, ``book.publisher``
    may be overwritten when a known publisher name is found in trailing
    bibliographic fragments.

    Parameters
    ----------
    booklist : iterable
        Objects exposing ``reviewlines`` (sequence of str) and ``publisher``
        (str).

    Returns
    -------
    list
        All Quotation objects created, in the order encountered. Lines that
        follow the last citation of a book are not emitted.
    """
    all_reviewwords, reviewdict = read_pubnames.get_names(
        'brd_pubs_indexed1920s.tsv')

    # First words (longer than four characters) of known review names, used
    # for fuzzy matching when OCR may have damaged the review name.
    longreviewnames = set()
    for rev in reviewdict.keys():
        reviewparts = rev.split()
        if len(reviewparts) < 1:
            continue
        elif len(reviewparts[0]) > 4:
            longreviewnames.add(reviewparts[0])

    # Only used for membership tests, so a set is sufficient.
    publishers = {
        'Liverlght', 'Appleton', 'Baker', 'Barnes', 'Benziger', 'Bobbs',
        "Brentano's", 'Cassell', 'Century', 'Collier-Fox', 'Crowell',
        'Ditson', 'Dodd', 'Doran', 'Doubleday', 'Dutton', 'Elder', 'Estes',
        'Ginn', 'Goodspeed', 'Harper', 'Heath', 'Holt', 'Houghton', 'Knopf',
        'Lane', 'Lippincott', 'Little', 'Liveright', 'Longmans', 'Macmillan',
        'McBride', 'McClure', 'McGraw', 'Moffat', 'Oxford', 'Page', 'Pott',
        'Putnam', 'Scribner', 'Simmons', 'Stokes', 'Walton', 'Warne',
        'Wessels', 'Wilde', 'Wiley', 'Winston', 'Yale'
    }

    # (tag, pattern) pairs; patterns are regexes (raw strings) or token sets.
    lexical_patterns = [
        ('numeric', r'.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('reviewword', all_reviewwords),
        ('openparen', r'\(.*'),
        ('closeparen', r'.*\)'),
        ('fullstop', r'.*\.'),
        ('commastop', r'.*\,'),
        ('startdash', r'—.*'),
        ('numeric', {'I:', 'II:', 'III:', 'IV:'}),
        ('titlecase', r'[A-Z].*'),
        ('monthabbrev', {'Ja', 'F', 'Mr', 'Ap', 'My', 'Je', 'Jl', 'Ag',
                         'S', 'O', 'N', 'D'}),
        ('lineendingyear', r'''['"•■]\d+'''),
        ('volandpgrange', r'[0-9]+[:][0-9-]+'),
        ('somenumeric', r'.?.?[0-9]{1,7}.?.?[0-9]*.?'),
        ('allcaps', r"[A-Z'\,]+"),
        ('dollarprice', r'.*\$.?.?[0-9]{1,7}.?[0-9]*[,.:=]?'),
        ('centprice', r'.?.?[0-9]{1,7}.?[0-9]*c+[,.:=]?'),
        ('hyphennumber', r'[0-9]{1,3}[-—~]+[0-9]{3,7}[,.:=)]?'),
        ('openquote', r'''["'“‘]+\S*'''),
        ('plusorminus', r'[\+\-\—]+'),
        ('reviewword', all_reviewwords),
        ('wordcount', r'\d*0w[.]?'),
        ('OCRwordcount', r'\S*Ow[.]?'),
    ]

    wordcountregex = re.compile(r'\d*0w[.]?')
    ocrwordcountregex = re.compile(r'\S*Ow[.]?')

    rule_list = lexparse.patterns2rules(lexical_patterns)
    allquotes = []

    # e.g. '4-' is a fairly common ocr misread for +
    plusmisreads = {
        '-4-', '4-', '1-', '-1-', '4—', '1—', '-|-', '-l-', '-)-', '—|—',
        '-I-', '-(-', '-f'
    }

    # Sentiment token -> canonical marks. Only consulted before the first
    # numeric token of a citation line.
    sentiment_marks = {
        '+': ('+',),
        '-': ('-',), '—': ('-',), '—-': ('-',),
        '=-': ('-',), '--': ('-',), '-—': ('-',),
        '==': ('-',), '=--': ('-',),
        '+-': ('+', '-'), '+—': ('+', '-'), '+=': ('+', '-'),
        '-+': ('-', '+'), '—+': ('-', '+'), '=+': ('-', '+'),
        # The trailing minus used to be appended after a `continue` and was
        # therefore never recorded.
        '++-': ('+', '+', '-'), '++—': ('+', '+', '-'),
        '++=': ('+', '+', '-'),
        '++': ('+', '+'),
        '+++': ('+', '+', '+'),
    }
    for misread in plusmisreads:
        sentiment_marks[misread] = ('+',)

    for book in booklist:
        lines = book.reviewlines
        accumulated = []
        citationcount = 0
        addtonext = ''
        skipnext = False

        for linecount, line in enumerate(lines):
            # We keep track of linecount because there are
            # characteristic kinds of noise early on, when trailing lines
            # of a citation get treated as part of the review.

            if addtonext:
                # text left over after a word count on the previous line
                line = addtonext + ' ' + line
                addtonext = ''

            if skipnext:
                # this line was already absorbed into the previous citation
                skipnext = False
                continue

            tokens = line.strip().split()
            if len(tokens) < 1:
                continue

            taglist = lexparse.apply_rule_list(rule_list, tokens)

            # in the first four lines (linecount <= 3) we often have
            # fragments left over from the book bibliographical entry
            if linecount <= 3:
                trailingbibline = any(
                    'hyphennumber' in tags or 'dollarprice' in tags
                    or 'centprice' in tags
                    for tags in taglist.tagseq)

                if trailingbibline:
                    # get the existing publisher to see if it makes more
                    # sense fused with something in this trailing line
                    existingpubparts = book.publisher.split()
                    if len(existingpubparts) > 0:
                        existingpub = existingpubparts[-1].strip('-')
                    else:
                        existingpub = 'not a publisher'

                    tokenssofar = []
                    for prior in accumulated:
                        tokenssofar.extend(prior.strip().split())
                    tokenssofar.extend(tokens)
                    tokenssofar = [x.strip('.,[]()-') for x in tokenssofar]

                    for tok in tokenssofar:
                        if tok in publishers:
                            book.publisher = tok
                        # a publisher name may have been hyphenated across
                        # the line break: try gluing the halves together
                        rejoined = existingpub + tok
                        if rejoined in publishers:
                            book.publisher = book.publisher.strip('-') + tok

                    line = line + ' <endsubj>'
                    accumulated.append(line)
                    continue

            # Sometimes a book is followed by a summary that
            # is not attributed to any particular review.
            # The only way I have to identify this is,
            # that a) this is the first sequence of lines and
            # b) the next line opens with a quotation mark,
            # and there has been no other citation info provided
            # yet.
            if citationcount == 0 and len(accumulated) > 3:
                if 'openquote' in taglist.tagseq[0]:
                    citationcount += 1
                    quote = Quotation(book, 'summary', '', 'summary',
                                      accumulated)
                    allquotes.append(quote)
                    # this line (opening with a quote) will be part of
                    # the next quotation
                    accumulated = [line]
                    continue

            # Tally the clues that this line is a review citation.
            numberwords = 0
            reviewwords = 0
            plusyet = False
            totalclues = 0

            for word, tags in zip(taglist.stringseq, taglist.tagseq):
                if 'reviewword' in tags:
                    reviewwords += 1
                    totalclues += 1
                elif 'plusorminus' in tags and not plusyet:
                    reviewwords += 0.5
                    totalclues += 1
                    plusyet = True
                elif 'monthabbrev' in tags:
                    totalclues += 1
                elif ('somenumeric' in tags and '-' not in word
                        and ',' not in word):
                    numberwords += 1
                    totalclues += 1
                    # NOTE(review): these sub-checks are read as applying to
                    # numeric tokens only (word count, vol:page, page
                    # number) -- confirm against the original layout.
                    if word.endswith('w'):
                        totalclues += 1
                        reviewwords += 0.5
                    elif ':' in word:
                        totalclues += 1
                        reviewwords += 0.5
                    elif word.startswith('p'):
                        totalclues += 1
                        reviewwords += 0.5

            # fuzzy match in situations where everything is there except
            # the review because it could easily be ocr error
            if numberwords > 0 and totalclues > 2 and reviewwords < 0.9:
                firstword = taglist.stringseq[0]
                if len(firstword) > 3:
                    for longname in longreviewnames:
                        similarity = match_strings(firstword, longname)
                        if similarity > .7:
                            reviewwords += 1
                            totalclues += 1
                            break

            if numberwords > 0 and reviewwords > 0.9 and totalclues > 3:
                sentimentbits = []
                numericyet = False
                publisherbits = []
                citationbits = []
                nextwordctr = 0

                for word, tags in zip(taglist.stringseq, taglist.tagseq):
                    nextwordctr += 1

                    if not numericyet:
                        if word in sentiment_marks:
                            sentimentbits.extend(sentiment_marks[word])
                            continue
                        if nextwordctr == 1 and word == "H":
                            # this is a weird but common misread; however,
                            # it's risky enough that we should only do it
                            # in first position
                            sentimentbits.extend(('+', '-'))
                            continue

                    if 'somenumeric' in tags:
                        numericyet = True

                    # words before the first number name the review;
                    # everything from the first number on is the citation
                    if not numericyet:
                        publisherbits.append(word)
                    else:
                        citationbits.append(word)

                    if numericyet and ('wordcount' in tags
                                       or 'OCRwordcount' in tags) and (
                            nextwordctr < len(taglist.stringseq)):
                        # a word count closes the citation; carry the rest
                        # of the line over to the next line
                        addtonext = ' '.join(taglist.stringseq[nextwordctr:])
                        break

                # if this line doesn't end with a word count, and the next
                # one does? probably a continuation
                if len(citationbits) > 0 and not wordcountregex.fullmatch(
                        citationbits[-1]) and not ocrwordcountregex.fullmatch(
                        citationbits[-1]):
                    if linecount < (len(lines) - 1):
                        wordsinnextline = lines[linecount + 1].strip().split()
                        # only a short next line (one or two tokens) is
                        # treated as the tail of this citation
                        if 0 < len(wordsinnextline) < 3 and \
                                wordcountregex.fullmatch(wordsinnextline[-1]):
                            citationbits.extend(wordsinnextline)
                            skipnext = True

                sentiment = ' '.join(sentimentbits)
                review = ' '.join(publisherbits)
                cite = ' '.join(citationbits)

                citationcount += 1
                quote = Quotation(book, review, sentiment, cite, accumulated)
                allquotes.append(quote)
                accumulated = []

            else:
                # not enough citation clues: ordinary review text
                accumulated.append(line)

    return allquotes
for cite in citation_list: parse_citation(cite, startinitial, endinitial) return citation_list ## MAIN paths = glob.glob('../pooles/poolesclean*txt') citations = [] start_headword = '' end_headword = '' rule_list = lexparse.patterns2rules(lexical_patterns) for p in paths: with open(p, encoding='utf-8') as f: filelines = f.readlines() page = [] for fl in filelines: fl = fl.replace('.—', '. —') fl = fl.replace('—', '— ') fl = fl.replace(',', ', ') # The point of doing that is to make # sure citations get cleanly separated. # They often begin with a dash. tokens = fl.strip().split() if len(tokens) < 1 and len(page) > 0: