def rm_lb_hyphens(plain_root, logger, ignore=[".json", ".log", ".err"]): """ Looks for a hyphen followed by whitespace or a line break. Reconstructs word and checks to see if the result exists in either WordNet or the OS's default spellchecker dictionary. If so, replaces fragments with reconstructed word. :param plain_root: The name of the directory containing plain-text files. :type plain_root: string :param logger: Logger that handles logging for the given directory. :type logger: Logger :param ignore: List of file extensions to ignore in the directory. :type ignore: list of strings, optional :returns: None """ try: d = enchant.Dict("en_US") except ImportError: d = None def recon(match_obj): rc_word = match_obj.group(1) + match_obj.group(2) if wn.synsets(rc_word) or (d and d.check(rc_word)): logger.info("\nbook: %s\nreconstructed word:\n%s\n", plain_root, rc_word) return rc_word logger.info( "\nbook: %s\nignored expression:\nleft: %s\nright: %s\n", plain_root, match_obj.group(1), match_obj.group(2) ) return match_obj.group(0) def inner(s): lb_hyphenated = re.compile(r"(\w+)-\s+(\w+)") return lb_hyphenated.sub(recon, s) page_files = os.listdir(plain_root) page_files = filter_by_suffix(page_files, ignore) for i, page_file in enumerate(page_files): filename = os.path.join(plain_root, page_file) with open(filename, "r+w") as f: page = f.read() page = inner(page) f.seek(0) f.write(page) f.truncate()
import os
import re

from unidecode import unidecode
from nltk.corpus import wordnet as wn

# `filter_by_suffix` and `proc_htrc_book` are assumed to be defined elsewhere
# in this package (the docstrings below point at vsm.ext.corpusbuilders.util
# for the related helpers).


def rm_lb_hyphens(plain_root, logger, ignore=['.json', '.log', '.err']):
    """
    Looks for a hyphen followed by whitespace or a line break.

    Reconstructs the word and checks whether the result exists in either
    WordNet or the OS's default spellchecker dictionary. If so, replaces
    the fragments with the reconstructed word.

    :param plain_root: The name of the directory containing plain-text
        files.
    :type plain_root: string

    :param logger: Logger that handles logging for the given directory.
    :type logger: Logger

    :param ignore: List of file extensions to ignore in the directory.
    :type ignore: list of strings, optional

    :returns: None
    """
    try:
        # Import lazily so the function degrades gracefully to
        # WordNet-only lookups when pyenchant is not installed.
        import enchant
        d = enchant.Dict('en_US')
    except ImportError:
        d = None

    def recon(match_obj):
        rc_word = match_obj.group(1) + match_obj.group(2)
        if wn.synsets(rc_word) or (d and d.check(rc_word)):
            logger.info('\nbook: %s\nreconstructed word:\n%s\n',
                        plain_root, rc_word)
            return rc_word
        logger.info('\nbook: %s\nignored expression:\nleft: %s\nright: %s\n',
                    plain_root, match_obj.group(1), match_obj.group(2))
        return match_obj.group(0)

    def inner(s):
        lb_hyphenated = re.compile(r'(\w+)-\s+(\w+)')
        return lb_hyphenated.sub(recon, s)

    page_files = os.listdir(plain_root)
    page_files = filter_by_suffix(page_files, ignore)

    for page_file in page_files:
        filename = os.path.join(plain_root, page_file)
        with open(filename, 'r+') as f:
            page = f.read()
            page = inner(page)
            f.seek(0)
            f.write(page)
            f.truncate()
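
# A minimal, self-contained sketch (not part of the original module) of the
# pattern above. It rejoins every fragment pair unconditionally for
# illustration, whereas rm_lb_hyphens only rejoins when WordNet or the
# spellchecker recognizes the reconstructed word.
def _demo_rm_lb_hyphens():
    import re
    sample = 'The experi- ment was recon-\nstructed.'
    lb_hyphenated = re.compile(r'(\w+)-\s+(\w+)')
    print(lb_hyphenated.sub(lambda m: m.group(1) + m.group(2), sample))
    # -> 'The experiment was reconstructed.'
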
def url_metadata(corpus, ctx_type, coll_dir):
    """
    Returns a list of urls whose order matches the existing metadata.
    The urls can be added to a Corpus object with the add_metadata
    function in vsm.ext.corpusbuilders.util.

    :param corpus: Corpus to add url metadata to. Urls match the
        existing metadata of `corpus`.
    :type corpus: Corpus

    :param ctx_type: A type of tokenization.
    :type ctx_type: string

    :param coll_dir: Path for the collection directory. Either the htrc
        86 plain or the htrc 1315 plain directory.
    :type coll_dir: string

    :returns: urls : List of urls to be added to `corpus`.

    :See Also: :meth: add_metadata
    """
    import json
    from vsm.viewer import doc_label_name

    urls = []
    corp_md = corpus.view_metadata('book')
    book_labels = corp_md[doc_label_name('book')]

    for book_label in book_labels:
        coll_path = os.path.join(coll_dir, book_label)
        booklist = os.listdir(coll_path)
        book = filter_by_suffix(booklist, ignore=['.txt', '.pickle'])
        book_path = os.path.join(coll_path, book[0])

        with open(book_path, 'r') as f:
            d = json.load(f)

        # Use the url of the most recently updated item for the book.
        li = sorted(d['items'], key=lambda k: int(k['lastUpdate']))
        url = li[-1]['itemURL']

        if ctx_type == 'book':
            urls.append(unidecode(url))
        else:
            # urls for pages
            page_md = corpus.view_metadata('page')
            files = [a for a in page_md['file'] if a.startswith(book_label)]
            nums = [re.findall('[1-9][0-9]*', a)[-1] for a in files]
            for i in nums:
                s = url + '?urlappend=%3Bseq={0}'.format(i)
                urls.append(unidecode(s))

    return urls
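
# A hedged usage sketch. The paths are hypothetical, and the exact
# Corpus.load and add_metadata signatures should be checked against
# vsm.corpus and vsm.ext.corpusbuilders.util before use:
#
#     from vsm.corpus import Corpus
#     from vsm.ext.corpusbuilders.util import add_metadata
#
#     corpus = Corpus.load('htrc86_corpus.npz')           # hypothetical path
#     urls = url_metadata(corpus, 'book', '/data/htrc-86-plain')
#     corpus = add_metadata(corpus, 'book', 'url', urls)  # assumed signature
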
def proc_htrc_coll(coll_dir, ignore=[".json", ".log", ".err"]): """ Given a collection, cleans up plain page files for books in the collection. :param coll_dir: The path for collection. :type coll_dir: string :param ignore: List of file extensions to ignore in the directory. :type ignore: list of strings, optional :returns: None :See Also: :meth: proc_htrc_book """ books = os.listdir(coll_dir) books = filter_by_suffix(books, ignore) for book in books: # For debugging # if book == 'uc2.ark+=13960=t1zc80k1p': # if book == 'uc2.ark+=13960=t8tb11c8g': # if book == 'uc2.ark+=13960=t74t6gz6r': proc_htrc_book(book, coll_dir, ignore=ignore)
def proc_htrc_coll(coll_dir, ignore=['.json', '.log', '.err']):
    """
    Given a collection, cleans up plain page files for books in the
    collection.

    :param coll_dir: The path for the collection.
    :type coll_dir: string

    :param ignore: List of file extensions to ignore in the directory.
    :type ignore: list of strings, optional

    :returns: None

    :See Also: :meth: proc_htrc_book
    """
    books = os.listdir(coll_dir)
    books = filter_by_suffix(books, ignore)

    for book in books:
        # For debugging
        # if book == 'uc2.ark+=13960=t1zc80k1p':
        # if book == 'uc2.ark+=13960=t8tb11c8g':
        # if book == 'uc2.ark+=13960=t74t6gz6r':
        proc_htrc_book(book, coll_dir, ignore=ignore)
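
# `filter_by_suffix` is used throughout this module but defined elsewhere in
# the package. A plausible sketch of its behavior (an assumption, not the
# package's actual definition):
def _filter_by_suffix_sketch(names, ignore):
    # Keep names that do not end with any of the ignored extensions.
    return [n for n in names if not any(n.endswith(s) for s in ignore)]

# Typical entry point over an HTRC collection directory (hypothetical path):
#
#     proc_htrc_coll('/data/htrc-86-plain')
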
def rm_pg_headers(plain_root, logger, bound=1, ignore=[".json", ".log", ".err"]): """ Tries to detect repeated page headers (e.g., chapter titles). If found, removes them. The routine takes the first non-empty lines of text, strips them of numbers and punctuation and computes frequencies. If frequency for the reduced string exceeds `bound`, the corresponding first lines are considered headers. :param plain_root: The name of the directory containing plain-text files. :type plain_root: string :param logger: Logger that handles logging for the given directory. :type logger: Logger :param bound: Number of frequency of a reduced string. If the string appears more than `bound`, then the corresponding first lines are considered headers. Default is 1. :param bound: int, optional :param ignore: List of file extensions to ignore in the directory. :type ignore: list of strings, optional :returns: None """ page_files = os.listdir(plain_root) page_files = filter_by_suffix(page_files, ignore) # Get first non-empty lines first_lines = [] fl = re.compile(r"^\s*([^\n]*)\n") for page_file in page_files: page_file = os.path.join(plain_root, page_file) with open(page_file, "r") as f: page = f.read() first_line = fl.match(page) if first_line == None: first_lines.append("") else: first_lines.append(first_line.group(0)) # Remove capitalization, roman numerals for numbers under 50, # punctuation, arabic numerals from first lines for i in xrange(len(first_lines)): line = first_lines[i] line = line.lower() # An overzealous arabic numeral detector (OCR errors include # `i` for `1` for example) line = re.sub(r"\b\S*\d+\S*\b", "", line) # Roman numerals i to xxxix line = re.sub(r"\b(x{0,3})(ix|iv|v?i{0,3})\b", "", line) # Collapse line to letters only line = re.sub(r"[^a-z]", "", line) first_lines[i] = (first_lines[i], line) freqs = dict() for line, reduced in first_lines: if reduced in freqs: freqs[reduced] += 1 else: freqs[reduced] = 1 for i, page_file in enumerate(page_files): filename = os.path.join(plain_root, page_file) line, reduced = first_lines[i] if freqs[reduced] > bound: with open(filename, "r") as f: page = f.read() if page: logger.info( u"\nbook: %s\nfile: %s\nremoved header:\n%s\n", unidecode(plain_root), unidecode(page_file), unidecode(line), ) page = fl.sub("", page) with open(filename, "w") as f: f.write(page)
def rm_pg_headers(plain_root, logger, bound=1, ignore=['.json', '.log', '.err']):
    """
    Tries to detect repeated page headers (e.g., chapter titles) and, if
    found, removes them.

    The routine takes the first non-empty line of each page, strips it
    of numbers and punctuation, and computes frequencies over the
    reduced strings. If the frequency of a reduced string exceeds
    `bound`, the corresponding first lines are considered headers.

    :param plain_root: The name of the directory containing plain-text
        files.
    :type plain_root: string

    :param logger: Logger that handles logging for the given directory.
    :type logger: Logger

    :param bound: Threshold frequency for a reduced string. If a string
        appears more than `bound` times, the corresponding first lines
        are considered headers. Default is 1.
    :type bound: int, optional

    :param ignore: List of file extensions to ignore in the directory.
    :type ignore: list of strings, optional

    :returns: None
    """
    page_files = os.listdir(plain_root)
    page_files = filter_by_suffix(page_files, ignore)

    # Get first non-empty lines
    first_lines = []
    fl = re.compile(r'^\s*([^\n]*)\n')

    for page_file in page_files:
        page_file = os.path.join(plain_root, page_file)
        with open(page_file, 'r') as f:
            page = f.read()

        first_line = fl.match(page)
        if first_line is None:
            first_lines.append('')
        else:
            first_lines.append(first_line.group(0))

    # Remove capitalization, roman numerals for numbers under 50,
    # punctuation, and arabic numerals from first lines
    for i in range(len(first_lines)):
        line = first_lines[i]
        line = line.lower()
        # An overzealous arabic numeral detector (OCR errors include
        # `i` for `1`, for example)
        line = re.sub(r'\b\S*\d+\S*\b', '', line)
        # Roman numerals i to xxxix
        line = re.sub(r'\b(x{0,3})(ix|iv|v?i{0,3})\b', '', line)
        # Collapse line to letters only
        line = re.sub(r'[^a-z]', '', line)
        first_lines[i] = (first_lines[i], line)

    freqs = dict()
    for line, reduced in first_lines:
        if reduced in freqs:
            freqs[reduced] += 1
        else:
            freqs[reduced] = 1

    for i, page_file in enumerate(page_files):
        filename = os.path.join(plain_root, page_file)
        line, reduced = first_lines[i]

        if freqs[reduced] > bound:
            with open(filename, 'r') as f:
                page = f.read()

            if page:
                logger.info(u'\nbook: %s\nfile: %s\nremoved header:\n%s\n',
                            unidecode(plain_root), unidecode(page_file),
                            unidecode(line))
                page = fl.sub('', page)

            with open(filename, 'w') as f:
                f.write(page)
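
# A self-contained sketch (not part of the original module) of the header
# normalization above: lowercase, drop digit-bearing tokens, drop roman
# numerals i-xxxix, then keep letters only, so variant renderings of the
# same header reduce to the same frequency key.
def _demo_header_reduce(line):
    import re
    line = line.lower()
    line = re.sub(r'\b\S*\d+\S*\b', '', line)                 # arabic numerals
    line = re.sub(r'\b(x{0,3})(ix|iv|v?i{0,3})\b', '', line)  # roman numerals
    return re.sub(r'[^a-z]', '', line)                        # letters only

# _demo_header_reduce('CHAPTER IV. 12') == _demo_header_reduce('Chapter iv')
# -> both reduce to 'chapter'
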