class TemplateBank:
    """Produce title templates from Markov-generated titles.

    Every title found in ``title_bank`` is added to a ``MarkovChain(3)``.
    A template is a generated title, split into tokens, in which two randomly
    chosen replaceable tokens have been swapped for placeholder tags
    (``[[PERSON]]``, ``[[LOC]]``, ``[[NOUN]]``, ``[[NOUNS]]`` or ``[[ADJ]]``).
    """

    # Number of generated titles tried before random_template() gives up.
    _MAX_ATTEMPTS = 25

    def __init__(self, title_bank):
        """Feed all titles of ``title_bank`` into the Markov chain.

        Args:
            title_bank: Mapping whose values are mappings with a ``'title'``
                string; em dashes in titles are replaced by plain hyphens
                before they are added to the chain.
        """
        self.markov = MarkovChain(3)
        for item in title_bank.values():
            self.markov.add(item['title'].replace('—', '-'))

    def _random_template(self):
        """Generate one title and turn it into a template.

        Returns:
            A list of strings in which even positions hold token text (or a
            placeholder tag) and odd positions hold the whitespace following
            that token, so ``''.join(result)`` gives the template text.
            ``None`` when the generated title has fewer than two replaceable
            tokens.
        """
        title = self.markov.generate()
        replacements = {}
        tokens = []
        doc = nlp(title)

        # Index in ``tokens`` of the text slot the next token will occupy.
        slot = 0
        for token in doc:
            # Consider named entities as single token.
            if token.ent_type_ in ('PERSON', 'FAC', 'GPE', 'LOC'):
                if token.ent_iob == 1:
                    # Continuation of the entity started by an earlier token:
                    # fold previous whitespace and this text into the previous
                    # text slot, so no new slot is created.
                    tokens[-2] += tokens[-1] + token.text
                    tokens[-1] = token.whitespace_
                else:
                    tokens.append(token.text)
                    tokens.append(token.whitespace_)
                    if token.ent_type_ == 'PERSON':
                        replacements[slot] = '[[PERSON]]'
                    else:
                        replacements[slot] = '[[LOC]]'
                    slot += 2
                continue

            tokens.append(token.text)
            tokens.append(token.whitespace_)
            if token.tag_ in ("NN", "NNP"):
                replacements[slot] = "[[NOUN]]"
            elif token.tag_ in ("NNS", "NNPS"):
                replacements[slot] = "[[NOUNS]]"
            elif token.pos_ == "ADJ":
                replacements[slot] = "[[ADJ]]"
            slot += 2

        if len(replacements) < 2:
            return None

        logger.debug('generated title: ' + ''.join(tokens))

        # Create a template by replacing two random tokens with POS tags.
        # random.sample() needs a real sequence: a dict items view is rejected
        # with TypeError on Python 3.11+, hence the explicit list().
        candidates = list(replacements.items())
        for index, replacement in random.sample(candidates, 2):
            tokens[index] = replacement

        logger.debug('generated template: ' + ''.join(tokens))
        return tokens

    def random_template(self):
        """Get random template from the bank.

        Returns:
            The first non-``None`` result of ``_random_template()``.

        Raises:
            RecursionError: If none of the ``_MAX_ATTEMPTS`` generated titles
                yields a template.
        """
        for _ in range(self._MAX_ATTEMPTS):
            template = self._random_template()
            if template is not None:
                return template
        raise RecursionError("Title generation was unable to find fitting template.")
def run_dir(base_path, authors_path):
    """Generate Markov text from the ``.txt`` files of one directory.

    Reads every file whose name ends in ``.txt`` inside
    ``base_path + authors_path + '/'``, removes curly-quoted passages,
    builds a ``MarkovChain`` from the combined text, prints the ``repr`` of
    ``generate(100)`` and hands the result to ``save`` under the name
    ``authors_path + '.txt'``.

    Args:
        base_path: Prefix of the directory path. It is joined by plain string
            concatenation, so it must already end with a path separator.
        authors_path: Directory name appended to ``base_path``; also used as
            the stem of the name passed to ``save``.
    """
    directory = base_path + authors_path + '/'

    # os.listdir() returns names in arbitrary order; sorting keeps the
    # concatenated corpus identical from run to run.
    names = sorted(
        name for name in os.listdir(directory) if name.endswith('.txt')
    )

    chunks = []
    for name in names:
        with open(directory + name, 'r') as handle:
            chunks.append(handle.read())
    text = ''.join(chunks)

    # special treatment for wittgenstein formulas:
    # drop each shortest curly-quoted span that sits on a single line.
    text = re.sub(r'“(.+?)”', '', text)

    markov = MarkovChain(text=text)
    bipolar_discourse = markov.generate(100)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(repr(bipolar_discourse))
    save(authors_path + '.txt', bipolar_discourse)