Exemplo n.º 1
0
class TemplateBank:
    """Bank of title templates derived from Markov-generated titles.

    A Markov chain is fed every ``title`` found in ``title_bank``. A template
    is produced by generating a title, splitting it into tokens, and swapping
    two randomly chosen tokens for placeholder tags such as ``[[NOUN]]`` or
    ``[[PERSON]]``.
    """

    # Entity labels whose tokens are collapsed into a single template token.
    _ENTITY_TYPES = ('PERSON', 'FAC', 'GPE', 'LOC')
    # Number of tokens swapped for placeholders in every template.
    _PLACEHOLDER_COUNT = 2
    # Generation attempts made by ``random_template`` before giving up.
    _MAX_ATTEMPTS = 25

    def __init__(self, title_bank):
        """Feed every title in ``title_bank`` to the Markov chain.

        Args:
            title_bank: Mapping whose values are mappings with a ``'title'``
                string entry.
        """
        self.markov = MarkovChain(3)
        for item in title_bank.values():
            # Em dashes are replaced with plain hyphens before training.
            self.markov.add(item['title'].replace('—', '-'))

    @staticmethod
    def _tokenize(doc):
        """Split a parsed title into tokens and placeholder candidates.

        Args:
            doc: Iterable of token objects exposing ``text``, ``whitespace_``,
                ``ent_type_``, ``ent_iob``, ``tag_`` and ``pos_``.

        Returns:
            A ``(tokens, replacements)`` pair. ``tokens`` alternates a text
            item with the whitespace that followed it. ``replacements`` maps
            the index of a text item in ``tokens`` to the placeholder that
            may take its place.
        """
        tokens = []
        replacements = {}
        for token in doc:
            placeholder = None
            if token.ent_type_ in TemplateBank._ENTITY_TYPES:
                # Consider named entities as single token: an "inside" token
                # (ent_iob == 1) is folded into the preceding text slot. The
                # ``tokens`` check keeps a doc that starts with an "inside"
                # token from indexing an empty list.
                if token.ent_iob == 1 and tokens:
                    tokens[-2] += tokens[-1] + token.text
                    tokens[-1] = token.whitespace_
                    continue
                if token.ent_type_ == 'PERSON':
                    placeholder = '[[PERSON]]'
                else:
                    placeholder = '[[LOC]]'
            elif token.tag_ in ("NN", "NNP"):
                placeholder = "[[NOUN]]"
            elif token.tag_ in ("NNS", "NNPS"):
                placeholder = "[[NOUNS]]"
            elif token.pos_ == "ADJ":
                placeholder = "[[ADJ]]"

            if placeholder is not None:
                # The text item is appended next, so its index is the
                # current length of ``tokens``.
                replacements[len(tokens)] = placeholder
            tokens.append(token.text)
            tokens.append(token.whitespace_)
        return tokens, replacements

    @staticmethod
    def _fill_placeholders(tokens, replacements, count=2):
        """Overwrite ``count`` randomly chosen tokens with their placeholders.

        Args:
            tokens: Token list to modify in place.
            replacements: Mapping of token index to placeholder string.
            count: How many entries of ``replacements`` to apply.

        Returns:
            The same ``tokens`` list, for convenience.
        """
        # random.sample needs a real sequence; dict views are rejected on
        # Python 3.11+, so materialise the items first.
        for index, placeholder in random.sample(list(replacements.items()), count):
            tokens[index] = placeholder
        return tokens

    def _random_template(self):
        """Generate one title and turn it into a template.

        Returns:
            The token list with two tokens replaced by placeholders, or
            ``None`` when the generated title offers fewer than two
            replaceable tokens.
        """
        title = self.markov.generate()
        tokens, replacements = self._tokenize(nlp(title))

        if len(replacements) < self._PLACEHOLDER_COUNT:
            return None

        logger.debug('generated title: %s', ''.join(tokens))

        # Create a template by replacing two random tokens with POS tags
        self._fill_placeholders(tokens, replacements, self._PLACEHOLDER_COUNT)

        logger.debug('generated template: %s', ''.join(tokens))

        return tokens

    def random_template(self):
        """Get random template from the bank.

        Returns:
            The first template produced within the allowed attempts.

        Raises:
            RecursionError: If none of the attempts yields a usable template.
        """
        for _ in range(self._MAX_ATTEMPTS):
            template = self._random_template()
            if template is not None:
                return template
        raise RecursionError("Title generation was unable to find fitting template.")
Exemplo n.º 2
0
def run_dir(base_path, authors_path):
    """Generate Markov text from one author's ``.txt`` files and save it.

    Every file ending in ``.txt`` inside ``base_path + authors_path + '/'``
    is read and concatenated, curly-quoted passages are removed, and the
    result is fed to a ``MarkovChain``. The generated text is printed as its
    ``repr`` and handed to ``save`` under the name ``authors_path + '.txt'``.

    Args:
        base_path: Prefix of the directory path. It is concatenated as-is,
            so it is expected to end with a path separator.
        authors_path: Directory name of the author; also the stem of the
            output file name.
    """
    path = base_path + authors_path + '/'
    # endswith() instead of a substring test, so names such as
    # "notes.txt.bak" are not picked up by accident.
    names = [name for name in os.listdir(path) if name.endswith('.txt')]

    chunks = []
    for name in names:
        # A separate name for the handle keeps the loop variable intact.
        with open(path + name, 'r') as handle:
            chunks.append(handle.read())
    text = ''.join(chunks)

    # special treatment for wittgenstein formulas: drop every passage
    # enclosed in curly quotes (single-line matches only, since '.' does
    # not cross newlines).
    text = re.sub(r'“(.+?)”', '', text)

    markov = MarkovChain(text=text)
    bipolar_discourse = markov.generate(100)
    # Function-call form prints the same on Python 2 and is valid on Python 3.
    print(repr(bipolar_discourse))
    save(authors_path + '.txt', bipolar_discourse)