Example #1
    def markov_url(targets, dependencies):

        # Load the session-split threshold (same units as parseTime returns).
        with open("config/person.json", 'r') as person:
            config = json.load(person)
        split = config["split"]

        header_skipped = False
        time = 0
        day = ''

        # Group consecutive URL visits into sessions: a new session starts
        # when the gap between visits exceeds `split`; on a day change the
        # current session continues, since times reset across days.
        data = []
        with open("logs/out/url.csv") as log_file:
            for line in csv.reader(log_file):
                if not header_skipped:
                    header_skipped = True  # skip the CSV header row
                    continue
                newTime = parseTime(line[1])
                if day != '' and day != line[0]:
                    data[-1].append(line[2])
                elif not data or newTime > split + time:
                    data.append([line[2]])
                else:
                    data[-1].append(line[2])
                day = line[0]
                time = newTime

        # Train an order-3 Markov chain on the sessions and sample a path.
        model = markovify.Chain(data, 3)
        path = model.walk()
        with open(targets[0], 'w') as output:
            for item in path:
                output.write(item + '\n')
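The example above relies on a parseTime helper that the snippet does not include. A minimal stand-in, assuming "HH:MM:SS" timestamps and a split threshold measured in seconds:

    def parseTime(stamp):
        # Hypothetical helper: parse "HH:MM:SS" into seconds since midnight.
        h, m, s = (int(part) for part in stamp.split(":"))
        return h * 3600 + m * 60 + s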
Example #2
    def __init__(self,
                 input_text=None,
                 state_size=constants.DEFAULT_NGRAM_SIZE,
                 chain=None,
                 parsed_sentences=None):
        """
        :param input_text: DISABLED, do not pass this. instead, pass parsed_sentences.
        :param ngram_size: the N in N-gram, AKA state size or window size, same as elsewhere
        :param chain:  A trained markovify.Chain instance for this text, if pre-processed.
        :param parsed_sentences:  A list of lists i.e. [ [word, word, ...], [word, word, ...], ... ]
            Assumption - these should be sentence-tokenized & word-tokenized before passing to here.
            in text_makers module there will be a wrapper that does just that.
        """
        # NOTE: we do not call super(); the markovify.Text constructor does things we don't want.
        # We override it, satisfying the same needs while adapting to our purposes.

        if input_text:
            raise Disabled(
                "disabled in this adapter; tokenize beforehand, pass to `parsed_sentences` in constructor"
            )

        self.state_size = state_size
        self.parsed_sentences = parsed_sentences

        self.chain = chain or markovify.Chain(self.parsed_sentences,
                                              state_size)

        # The "rejoined_text" variable is checked in make_sentences -> test_sentence_output, which
        # "assesses the novelty of sentences". This is a very cool feature, but so far it depends on the
        # 'eager' stringification that we are trying to get away from. For now, we'll disable it.
        self.rejoined_text = u'<DISABLED>'
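A usage sketch for the constructor above (the class name TokenizedTextAdapter is hypothetical; parsed_sentences, state_size, and the chain attribute come from the code shown):

    sentences = [["the", "cat", "sat"], ["the", "dog", "ran"]]
    text = TokenizedTextAdapter(parsed_sentences=sentences, state_size=2)
    words = text.chain.walk()  # markovify.Chain.walk() returns a list of tokens
    print(" ".join(words))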
Example #3
    def __init__(self, do_markovify=True):
        print("tagging the datasets and markovifying them ... please wait!")
        # print(list(brown.tagged_sents()))
        # print(list(nps_chat.tagged_words()))
        # with open("reddit_apple_android.txt", "w") as text_file:
        #     self.tagged_sents = list(nltk.pos_tag(sent) for sent in (text_file.sents('reddit_apple_android.txt')))

        self.tagged_sents = list(brown.tagged_sents())
        # self.tagged_sents = list(treebank.tagged_sents())
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-emma.txt')))
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('quora.txt')))
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('reddit_apple_android.txt')))
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('hackernews.txt')))
        # extend (not append) so the corpus stays a flat list of tagged sentences
        self.tagged_sents.extend(list(treebank.tagged_sents()))
        # self.tagged_sents.append(list(nps_chat.tagged_words()))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-emma.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('chesterton-brown.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-persuasion.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-sense.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('reddit_apple_android.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (genesis.sents('english-web.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(gutenberg.sents('austen-persuasion.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(gutenberg.sents('austen-sense.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(genesis.sents('english-web.txt'))))
        # self.tagged_sents.append(list(genesis.tagged_words()))
        # self.tagged_sents.append(list(snowball_data.tagged_words()))

        # print(self.tagged_sents)
        if do_markovify:
            self.model = markovify.Chain(self.tagged_sents, 2)
Example #4
    def test_chain_update(self):
        chain = markovify.Chain([["foo", "bar"]], state_size=1)
        # States so far: ('___BEGIN__',), ('foo',), ('bar',)
        assert len(chain.model.keys()) == 3
        assert "testing" not in chain.begin_choices

        chain.update([["testing", "testing"]])
        # The update adds one new state, ('testing',)
        assert len(chain.model.keys()) == 4
        assert "testing" in chain.begin_choices
Example #5
    def __init__(self, do_markovify=True):
        """

        :param do_markovify:
        """
        self.tagged_sents = list(brown.tagged_sents())
        if do_markovify:
            self.model = markovify.Chain(self.tagged_sents, 2)
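A consumption sketch for the class above (the name TaggedBrownModel is hypothetical): the chain is trained on runs of (word, tag) tuples, so Chain.walk() yields tagged pairs.

    m = TaggedBrownModel(do_markovify=True)
    pairs = m.model.walk()  # a list of (word, POS-tag) tuples
    print(" ".join(word for word, tag in pairs))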
Example #6
def slovodel_config(tmpdir):
    path = tmpdir.mkdir("sub")
    file_noun = path.join("noun.json")
    file_verb = path.join("verb.json")
    file_adjective = path.join("adjective.json")
    file_noun.write(markovify.Chain([["абвг"]], 1).to_json())
    file_verb.write(markovify.Chain([["вгде"]], 1).to_json())
    file_adjective.write(markovify.Chain([["дежз"]], 1).to_json())
    config = word_maker.Configuration(
        {
            word_maker.wordTypes.NOUN: file_noun,
            word_maker.wordTypes.VERB: file_verb,
            word_maker.wordTypes.ADJECTIVE: file_adjective,
        },
        db.Configuration("dummy", "dummy", 0, 0, None),
    )
    return config
Example #7
    def __init__(self, input_text, state_size=2, chain=None):
        """
        input_text: A string.
        state_size: An integer, indicating the number of words in the model's state.
        chain: A trained markovify.Chain instance for this text, if pre-processed.
        """
        runs = list(self.generate_corpus(input_text))
        # Rejoined text lets us assess the novelty of generated sentences
        self.rejoined_text = self.sentence_join(map(self.word_join, runs))
        self.state_size = state_size
        self.chain = chain or markovify.Chain(runs, state_size)
Example #8
    def __init__(self, input_text, state_size=2, chain=None):
        """
        input_text: A list of strings representing individual comments.
        state_size: An integer indicating the number of words in the model's state.
        chain: A trained markovify.Chain instance for this text, if pre-processed.
        """
        if chain is None:
            runs = self.generate_corpus(input_text)

        self.input_text = input_text
        self.state_size = state_size
        self.chain = chain or markovify.Chain(runs, state_size)
Example #9
    def get_requests(self, appID, stripeSize, numStripes):
        currRequest = Request("req0", stripeSize, numStripes)
        currSize = stripeSize * numStripes
        reqList = [currRequest]
        if self.numPredictedRequests == 0:
            return reqList
        if appID not in self.appIDtoModelDict:
            # First request from this app: start a fresh per-app model.
            self.appIDtoModelDict[appID] = {
                'mean': float(currSize),
                'std': 0.0,
                'numSamples': 1,
                'samples': [currSize]
            }
        else:
            # Online update of the running mean and (population) variance.
            model_dict = self.appIDtoModelDict[appID]
            model_dict['numSamples'] += 1
            n = model_dict['numSamples']
            prevMean = model_dict['mean']
            model_dict['mean'] = (float(n - 1) * prevMean + float(currSize)) / n
            prevVariance = math.pow(model_dict['std'], 2)
            newVariance = (float(n - 1) * prevVariance +
                           (float(currSize) - prevMean) *
                           (float(currSize) - model_dict['mean'])) / n
            model_dict['std'] = math.sqrt(newVariance)
            # Keep a sliding window of at most maxCorpusLength samples.
            model_dict['samples'].append(currSize)
            if len(model_dict['samples']) > self.maxCorpusLength:
                model_dict['samples'] = model_dict['samples'][1:]
            if model_dict['numSamples'] >= self.minCorpusLength:
                # Train a Markov chain on the (normalized) sample history
                # and predict the next few request sizes.
                X = [
                    self.generate_training_samples(model_dict['samples'],
                                                   model_dict['mean'],
                                                   model_dict['std'])
                ]
                model = markovify.Chain(X, self.markovOrder)
                predictions = self.get_predictions(model, X[0],
                                                   self.numPredictedRequests)
                predictedStripeCounts = self.convert_predictions_to_stripe_count(
                    predictions, model_dict['mean'], model_dict['std'],
                    stripeSize)
                for i in range(len(predictedStripeCounts)):
                    reqName = "req{0}".format(i + 1)
                    req = Request(reqName, stripeSize,
                                  predictedStripeCounts[i])
                    reqList.append(req)
        return reqList
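The running mean/variance bookkeeping above is a Welford-style online update. A self-contained sanity check against the batch formulas (the sample values are arbitrary):

    samples = [4, 8, 15, 16, 23, 42]
    mean, var, n = 0.0, 0.0, 0
    for x in samples:
        # same recurrences as get_requests above
        n += 1
        prev_mean = mean
        mean = ((n - 1) * prev_mean + x) / n
        var = ((n - 1) * var + (x - prev_mean) * (x - mean)) / n

    batch_mean = sum(samples) / len(samples)
    batch_var = sum((x - batch_mean) ** 2 for x in samples) / len(samples)
    assert abs(mean - batch_mean) < 1e-9
    assert abs(var - batch_var) < 1e-9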
Example #10
    def test_entropy(self):
        model = {('___BEGIN__',): {'0': 1.0, '1': 0.0},
                 ('0',): {'0': 0.5, '1': 0.5},
                 ('1',): {'0': 1.0, '1': 0.0}}
        chain = markovify.Chain(None, state_size=1, model=model, finite=True)

        self.assertAlmostEqual(chain.entropy(('___BEGIN__',)), 0)
        self.assertAlmostEqual(chain.entropy(('0',)), 1.0)
        self.assertAlmostEqual(chain.entropy(('1',)), 0)

        # the generated string should contain 10 or 11 '0' characters
        pw = chain.gen_entropy(10)
        assert 10 <= len([c for c in pw if c == '0']) <= 11
Example #11
    def test_select_most_frequent_follower_retrieves_unguessed_letter(self):
        chain = markovify.Chain([
            ['a', 'b'],
            ['a', 'b', 'a', 'b'],
            ['a', 'd'],
            ['b', 'a', 'd'],
            ['a', 'c'],
        ], state_size=1)
        guesser = SingleStateMarkovGuesser(word_length=9,
                                           potential_words=['implosion'])
        guesser.incorrect_guesses = {'b'}
        guesser.alphabet = {'c'}
        guess = guesser._select_most_frequent_follower(chain, ('a',))
        self.assertEqual(guess, 'c')
Example #12
def create_chats_newlinetext(chats: List[Dict[str, Any]],
                             state_size: int) -> Optional[markovify.NewlineText]:
    # Create a list with all messages
    messages: List[List[str]] = []
    # For each chat in the list
    for chat in chats:
        # Find the chat name
        name: Optional[str] = chat.get("name")
        if name is None:
            name = "Unknown"
        # For each update in the chat
        with click.progressbar(chat["messages"],
                               label=name,
                               length=len(chat["messages"]),
                               show_percent=True,
                               fill_char="█",
                               empty_char="░") as updates_bar:
            for update in updates_bar:
                # Check that the update is not a service update
                if update["type"] != "message":
                    continue
                # Skip updates without a sender (e.g. deleted accounts or bots)
                if update.get("from") is None:
                    continue
                # Find the message inside the update
                message: str = merge_message(update["text"])
                # Skip commands
                if message.startswith("/"):
                    continue
                # Split the message in words
                words: List[str] = message.split()
                # Append the words to the messages
                messages.append(words)
    # If the chat has no messages, return None
    if len(messages) == 0:
        return None
    # Create the chain from the words
    chain = markovify.Chain(messages, state_size=state_size)
    # Wrap the chain in a NewlineText so it can generate sentences
    text = markovify.NewlineText(None, state_size=state_size, chain=chain)
    # Return the text
    return text
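A quick usage sketch (the chat structure mirrors what the function reads; the values are made up, and the merge_message helper, which is not shown, is assumed to accept a plain string):

    chats = [{"name": "Example chat",
              "messages": [{"type": "message", "from": "alice",
                            "text": "hello there world"}]}]
    text = create_chats_newlinetext(chats, state_size=2)
    if text is not None:
        print(text.make_sentence(test_output=False))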
Example #13
    def __init__(self, input_text, state_size=2, finite=False):
        runs = self.generate_corpus(input_text)

        self.chain = markovify.Chain(list(runs), state_size, finite=finite)
Example #14
    def __init__(self, word_length, potential_words, *args, **kwargs):
        super().__init__(word_length, potential_words, *args, **kwargs)
        self.markov_model_2 = markovify.Chain(self.potential_word_letters,
                                              state_size=2)
Example #15
def splitText(text):
    split_text = []
    for t in text:
        split_text.append("".join(t).split(" "))
    return split_text


if __name__ == "__main__":
    parser = ArgParser()
    args = parser.parse_args()
    
    fileNames = getFileNames(args.data_folder)
    text = splitText(loadData(args.data_folder, fileNames))

    # Create a list of markov chains for combination
    chains = [markovify.Chain([t], state_size=args.state_size) for t in text]

    # Combine all chains
    chain = markovify.combine(chains)

    # Generate stories
    stories = []
    init_state = tuple(args.init_state.split(" ")) if args.init_state else ()
    if args.init_state and len(init_state) != args.state_size:
        print("Length of init_state must be equal to state_size. Received length {} and state size {}".format(len(init_state), args.state_size))
        exit(1)
        
    for i in range(args.num_stories):
        if init_state:
            try:
                gen = [word for word in chain.gen(init_state=init_state)]
Example #16
        functions_duration[event['function']] = event['duration']

functions_with_rank = [
    '{0}_{1}'.format(event['function'], event['rank']) for event in events
]
output_file = open('data/output.csv', 'w+')
output_file.write('function, rank, start_time, duration\n')
for event in events:
    output_file.write('{0}, {1}, {2}, {3}\n'.format(
        event['function'], event['rank'],
        int(event['start_time'].timestamp()) * 1000000 +
        event['start_time'].microsecond,
        int(float(event['duration']) * 1000000)))

output_file.close()
model = markovify.Chain([functions_with_rank[20:-50]], PROCESS - 1)
model = model.compile()  # Chain.compile() returns a new chain unless inplace=True
tries = 0
status = []
state = ('___BEGIN__', ) * (PROCESS - 1)
for i in range(LENGTH):
    while True:
        next_state = model.move(state)
        if next_state != '___END__':
            break
    status.append(next_state)
    state = tuple(state[1:]) + (next_state, )

count = 1
start_times = [0] * PROCESS
function_name_index = {}
Example #17
    def test_bad_corpus(self):
        with self.assertRaises(Exception) as context:
            # A bare string is not a list of word runs, so this should raise
            markovify.Chain(corpus="testing, testing", state_size=2)
Example #18
def analyzeURLS(log):
    """Group logged URLs into sessions split by gaps longer than `split`."""
    header_skipped = False
    time = 0
    split = 10

    data = []
    for line in log:
        if not header_skipped:
            header_skipped = True  # skip the header row
            continue
        newTime = parseTime(line[1])
        if not data or newTime > split + time:
            data.append([line[2]])
        else:
            data[-1].append(line[2])
        time = newTime
    return data

data = analyzeURLS(csv.reader(open("url.log.txt"))) + analyzeURLS(csv.reader(open("url1.log.txt")))
model = markovify.Chain(data, 3)
path = model.walk()

code = open("arduino.txt", 'r').read()
output = open("output.ino", 'w')
urls = []
for item in path:
    urls.append("\"" + item + "\"")

output.write(code.format("{" + ", ".join(urls) + "}", len(path)))
Example #19
def train_HMM(corpus):
    """Train an order-5 Markov chain on a corpus of 'sentences'."""
    MC = markovify.Chain(corpus, 5)
    return MC
Example #20
import pandas as pd
import markovify
import format_lyrics

song_data = pd.read_csv('rapper/data/songs_and_lyrics.csv',
                        encoding="ISO-8859-1")
song_data['lyrics'] = [
    format_lyrics.format_lyrics(lyric) for lyric in song_data['lyrics']
]

song_model = markovify.Chain(song_data['lyrics'], state_size=2)
with open('rapper/data/billboard_100_bigram_model.json', 'w') as model:
    model.write(song_model.to_json())
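A companion sketch for loading the saved model back; markovify.Chain.from_json is the counterpart of the to_json call used above:

    import markovify

    with open('rapper/data/billboard_100_bigram_model.json') as f:
        song_model = markovify.Chain.from_json(f.read())
    print(song_model.walk())  # one sampled sequence of lyric tokens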
Example #21
        predicted_id = tf.math.argmax(predictions, axis=1)[-1]
        
        # print(encode(tf.math.argmax(predictions, axis=1), revoc))
        
        start = tf.expand_dims([predicted_id], 0)

        result.append(revoc[predicted_id])
        # result += encode(predicted_ids, revoc)

    return (start_string + ''.join(result))

model = build_model(len(voc), bsize)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

i, more = 150, 450
model.fit(dataset, initial_epoch=i, epochs=i+more, callbacks=[checkpoint])

sbatch_model = build_model(len(voc), 1)
sbatch_model.load_weights(tf.train.latest_checkpoint('chpts'))
sbatch_model.build(tf.TensorShape([1, None]))

print(generate_text(sbatch_model, 'заратустра сказал', 1000))

corpus = list(map(lambda x: list(x + '.'), re.split(r'[\?\!\.…]+', zar)))

chain = markovify.Chain(corpus=corpus, state_size=5)
print(' '.join([''.join(chain.walk()) for _ in range(100)]))


Example #22
    def __init__(self, word_length, potential_words, *args, **kwargs):
        super().__init__(word_length, potential_words, *args, **kwargs)
        self.potential_word_letters = [list(word) for word in potential_words]
        self.markov_model_1 = markovify.Chain(self.potential_word_letters,
                                              state_size=1)
        self.alphabet = self._derive_alphabet(self.potential_words)
Example #23
mid = mido.MidiFile('song.mid')
notes = []
# outport = mido.open_output()
for msg in mid:
    print(msg)
    if msg.type == 'note_on':
        # outport.send(msg)
        notes.append((msg.note, round(msg.time, 2)))
    if msg.type == 'note_off':
        # overwrite the last note's time with the note_off delta (its duration)
        t = list(notes[-1])
        t[1] = round(msg.time, 2)
        notes[-1] = tuple(t)
print(notes)
# outport.close()

text_model = markovify.Chain([notes], state_size=4)
generated = text_model.walk()
print(generated)

pygame.midi.init()
player = pygame.midi.Output(0)
player.set_instrument(0)

for note, length in generated:
    player.note_on(note, 127)
    time.sleep(length * 2)
    player.note_off(note, 127)

del player
pygame.midi.quit()
Example #24
def make_models(folder):
    markov_model = None
    with open(join(folder, "tag_children.txt")) as f:
        text = f.read()
        corpus = [line.split("^") for line in text.split("\n")]
        markov_model = markovify.Chain(corpus, 2)
    with open(join(folder, "structure_markov.pickle"), 'wb') as f:
        pickle.dump(markov_model, f)

    # Count word occurrences per tag from tag_words.txt.
    tag_words_model = {}
    with open(join(folder, "tag_words.txt")) as f:
        text = f.read()
        tag_word_pairs = [
            line.split("^", 1) for line in text.split("\n") if "^" in line
        ]
        for tag, word in tag_word_pairs:
            if not tag_words_model.get(tag):
                tag_words_model[tag] = {}
            tag_words_model[tag][word] = tag_words_model[tag].get(word, 0) + 1
    with open(join(folder, "tag_words.pickle"), 'wb') as f:
        pickle.dump(tag_words_model, f)

    tags_only_model = {}
    with open(join(folder, "tags_only.txt")) as f:
        text = f.read()
        tag_word_pairs = [
            line.split("^", 1) for line in text.split("\n") if "^" in line
        ]
        for tag, word in tag_word_pairs:
            if not tags_only_model.get(tag):
                tags_only_model[tag] = {}
            tags_only_model[tag][word] = tags_only_model[tag].get(word, 0) + 1
    with open(join(folder, "tags_only.pickle"), 'wb') as f:
        pickle.dump(tags_only_model, f)

    tags_parent_words_model = {}
    with open(join(folder, "tags_parent_words.txt")) as f:
        text = f.read()
        tag_word_pairs = [
            line.split("^", 1) for line in text.split("\n") if "^" in line
        ]
        for tag, word in tag_word_pairs:
            if not tags_parent_words_model.get(tag):
                tags_parent_words_model[tag] = {}
            tags_parent_words_model[tag][
                word] = tags_parent_words_model[tag].get(word, 0) + 1
    with open(join(folder, "tags_parent_words.pickle"), 'wb') as f:
        pickle.dump(tags_parent_words_model, f)

    tags_parent_words_lsiblings_model = {}
    with open(join(folder, "tags_parent_words_lsiblings.txt")) as f:
        text = f.read()
        tag_word_pairs = [
            line.split("^", 1) for line in text.split("\n") if "^" in line
        ]
        for tag, word in tag_word_pairs:
            if not tags_parent_words_lsiblings_model.get(tag):
                tags_parent_words_lsiblings_model[tag] = {}
            tags_parent_words_lsiblings_model[tag][
                word] = tags_parent_words_lsiblings_model[tag].get(word, 0) + 1
    with open(join(folder, "tags_parent_words_lsiblings.pickle"), 'wb') as f:
        pickle.dump(tags_parent_words_lsiblings_model, f)
Example #25
def build_model(pose_map: Dict[str, data.Pose], flows: List[List[str]],
                state_size: int) -> markovify.Chain:
    if not all(yogaflo.validate_flow(pose_map, flow) for flow in flows):
        raise ValueError("Invalid flow as input")
    return markovify.Chain(flows, state_size)
Example #26
def combine(a, b):
    if not a:
        m = b
    else:
        m = markovify.combine([a, b])

    return m


def fetch_comments(story):
    return [comment.get('text', '') for comment in kids(story)]


def munge_comment(comment):
    comment = re.sub('<[^>]+>', '', comment)
    comment = html.unescape(comment)
    return comment


if __name__ == "__main__":
    stories = fetch_stories('new')
    corpus = []
    for story in with_kids(stories):
        print(f"Story #{story} has comments")
        corpus += [munge_comment(c).split() for c in fetch_comments(story)]

    # corpus = [["A", "list", "of", "sentences"], ...]
    model = markovify.Chain(corpus, state_size=3)
    with open(f'hn_markov_{time.time()}.json', 'w') as f:
        f.write(model.to_json())