def search_corpus(pattern, corpus, context, case_insensitive=True, expand_keyword=False, lemmatized=False, threshold=0.70):
    """Search for a regex pattern in the TLG or PHI5 corpus.

    TODO: Clean up hyphenation.
    """
    corpora = ['tlg', 'phi5']
    assert corpus in corpora, "Available corpora: '{}'.".format(corpora)
    if type(context) is str:
        contexts = ['sentence', 'paragraph']
        assert context in contexts or type(context) is int, 'Available contexts: {}'.format(contexts)
    else:
        context = int(context)
    if corpus == 'phi5':
        lang = 'latin'
        index = PHI5_INDEX
        paths = assemble_phi5_author_filepaths()
    elif corpus == 'tlg':
        index = TLG_INDEX
        lang = 'greek'
        paths = assemble_tlg_author_filepaths()
    if expand_keyword:
        # Strip all regex characters from pattern for the Word2Vec lookup.
        # First rm escaped chars.
        # TODO: Add '\u', '\U', '\x' to this list.
        escapes_list = [r'\a', r'\b', r'\f', r'\n', r'\r', r'\t', r'\v', r'\\']
        escapes_str = '|'.join(escapes_list)
        comp_escapes = regex.compile(escapes_str, flags=regex.VERSION1)
        pattern = comp_escapes.sub('', pattern)
        # Second rm remaining punctuation.
        punctuation = set(string.punctuation)
        pattern = ''.join(ch for ch in pattern if ch not in punctuation)
        similar_vectors = _keyword_expander(pattern, lang, lemmatized=lemmatized, threshold=threshold)
        print("The following similar terms will be added to the '{0}' query: '{1}'.".format(pattern, similar_vectors))
        pattern = [pattern]
        if similar_vectors:
            pattern += similar_vectors
    else:
        # Wrap the single keyword in a list so the loop below iterates
        # patterns rather than the characters of a string.
        pattern = [pattern]
    for path in paths:
        with open(path) as file_open:
            text = file_open.read()
        for one_pattern in pattern:
            _matches = match_regex(text, one_pattern, language=lang, context=context, case_insensitive=case_insensitive)
            for _match in _matches:
                _id = os.path.split(path)[1][:-4]
                author = index[_id]
                yield (author, _match)
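A minimal usage sketch, assuming the PHI5 plaintext corpus has already been converted locally with the CLTK importer and the module-level imports are in place; the keyword is only an example.

from itertools import islice

# Hypothetical call: print the first five PHI5 sentences matching a keyword.
for author, excerpt in islice(search_corpus(r'amicitia', 'phi5', context='sentence'), 5):
    print('{}: {}'.format(author, excerpt))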
def _assemble_corpus_string(self, corpus):
    """Yield the lowercased, cleaned-up contents of each author file in the corpus."""
    if corpus == 'phi5':
        filepaths = assemble_phi5_author_filepaths()
        file_cleaner = phi5_plaintext_cleanup
    elif corpus == 'tlg':
        filepaths = assemble_tlg_author_filepaths()
        file_cleaner = tlg_plaintext_cleanup
    for filepath in filepaths:
        with open(filepath) as file_open:
            file_read = file_open.read().lower()
        file_clean = file_cleaner(file_read)
        yield file_clean
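Despite the name, the method is a generator, so a caller that needs a single corpus-wide string has to join the pieces itself. A sketch, under the assumption that `trainer` is an instance of the (not shown) class that defines the method:

# Hypothetical: build one big training string from all cleaned PHI5 texts.
phi5_string = '\n'.join(trainer._assemble_corpus_string('phi5'))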
def test_assemble_tlg_author(self):
    """Test building absolute filepaths from TLG index."""
    paths = assemble_tlg_author_filepaths()
    self.assertEqual(len(paths), 1823)
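A companion check for the PHI5 index might look like the sketch below; it deliberately avoids hard-coding an author count, since that number depends on the converted corpus.

def test_assemble_phi5_author(self):
    """Test building absolute filepaths from PHI5 index (hypothetical sketch)."""
    paths = assemble_phi5_author_filepaths()
    self.assertTrue(len(paths) > 0)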
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus.

    Yield one sentence at a time from each author file; each sentence is a
    list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')
        if rm_stops:
            stops = greek_stops  # Greek stop list, assumed imported alongside latin_stops
        else:
            stops = None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # Light first-pass cleanup before sentence tokenization (which relies on punctuation).
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # Second, sentence-level cleanup to rm all punctuation.
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if lemmatize and sentence:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
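The generator emits exactly the list-of-token sentences that gensim's Word2Vec consumes, so training could be wired up roughly as below; gensim, the parameter values, and the output path are assumptions here, not part of the original module.

from gensim.models import Word2Vec

# Hypothetical training sketch: Word2Vec iterates the corpus more than once,
# so materialize the generator's output first.
sentences = list(gen_docs('phi5', lemmatize=True, rm_stops=True))
model = Word2Vec(sentences, min_count=5, workers=4)
model.save('phi5_word2vec.model')  # hypothetical output path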
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus.

    Yield one sentence at a time from each author file; each sentence is a
    list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        if rm_stops:
            stops = greek_stops  # Greek stop list, assumed imported alongside latin_stops
        else:
            stops = None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # Light first-pass cleanup before sentence tokenization (which relies on punctuation).
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # Second, sentence-level cleanup to rm all punctuation.
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if lemmatize and sentence:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
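This variant swaps the CLTK WordTokenizer for nltk_tokenize_words but yields the same kind of stream. A quick sanity-check sketch; the corpus choice and sentence count are arbitrary examples.

from itertools import islice

# Hypothetical: peek at the first two tokenized TLG sentences before a long run.
for sent in islice(gen_docs('tlg', lemmatize=False, rm_stops=False), 2):
    print(sent)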