Example #1
def get_episode(episode_id):
    """ Show specific episode
    """
    statement = """\
    MATCH (e:Episode {id: {episodeId}})-[r:TOPIC]->(topic)
    WITH e, r, topic ORDER BY r.score DESC
    RETURN e.id AS id, e.title AS title, e.season AS season, e.number AS number,
           COLLECT({id: topic.id, name: topic.value, score: r.score}) AS topics
    ORDER BY id
    """

    episode = graph.cypher.execute(statement,
                                   {"episodeId": int(episode_id)})[0]

    season = episode["season"]
    number = episode["number"]

    sentences = []
    with open("data/import/sentences.csv", "r") as sentences_file:
        reader = csv.reader(sentences_file, delimiter=",")
        next(reader)  # skip the CSV header row
        for row in reader:
            if int(row[1]) == int(episode['id']):
                tokenized_sentence = nltk.word_tokenize(row[4].decode('utf-8'))
                sentence_pos = nltk.pos_tag(tokenized_sentence)
                # classify each token: True if it looks like part of the speaker's name
                word_pos = [(word,
                             classifier.classify(
                                 pos_features(tokenized_sentence, sentence_pos,
                                              i)))
                            for i, word in enumerate(tokenized_sentence)]

                # the speaker is the leading run of tokens classified as True
                speaker = list(
                    itertools.takewhile(lambda x: x[1] == True, word_pos))

                sentences.append(("".join(s[0] for s in speaker), row[4]))

    with open("data/transcripts/S%d-Ep%d" % (season, number)) as transcript_file:
        transcript = transcript_file.read()
    soup = BeautifulSoup(transcript)
    rows = select(soup, "table.tablebg tr td.post-body div.postbody")

    return template("episode",
                    episode=episode,
                    transcript=rows[0],
                    sentences=sentences)
Example #3
import nltk
from nltk import ClassifierI

from himymutil.ml import pos_features

class NaiveClassifier(ClassifierI):
    def classify(self, featureset):
        # a token is part of the speaker's name if the next token is a ":"
        if featureset['next-word'] == ":":
            return True
        else:
            return False

if __name__ == '__main__':
    classifier = NaiveClassifier()

    sentence = "Ted from 2030: Oh,we were bigfansofNewYork'sannualHalloweenparade.Idon'tmeantheonethattakesplaceHalloweennightintheVillage.ImeantheonethattakesplacethemorningofNovember1st,theAnnualPostHalloweenWalkofShameParade."
    tokenized_sentence = nltk.word_tokenize(sentence)
    for i, word in enumerate(tokenized_sentence):
        print "{0} -> {1}".format(word, classifier.classify(pos_features(tokenized_sentence, i)))
import json

import nltk
from sklearn.cross_validation import train_test_split

with open("data/import/trained_sentences.json", "r") as json_file:
    json_data = json.load(json_file)

tagged_sents = []
for sentence in json_data:
    tagged_sents.append([(word["word"], word["speaker"])
                         for word in sentence["words"]])

featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    sentence_pos = nltk.pos_tag(untagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, sentence_pos, i), tag))

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
X = vec.fit_transform([item[0] for item in featuresets]).toarray()

# >>> len(X)
# >>> len(X[0])
# >>> vec.get_feature_names()[10:15]

Y = [item[1] for item in featuresets]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, train_size=0.80)  # split sizes assumed to match Example #7
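
The examples above lean on pos_features from himymutil.ml, which is never shown. Below is a minimal hypothetical sketch of what such a feature extractor might look like: only the 'next-word' key is implied by NaiveClassifier above, and the remaining keys and the three-argument signature (the one used by the feature-building code; Example #5 also calls a two-argument variant) are assumptions, not the real implementation.

def pos_features(sentence, sentence_pos, i):
    """Hypothetical sketch of a token feature extractor (not the real himymutil.ml code)."""
    features = {"word": sentence[i],
                "pos": sentence_pos[i][1],
                "position": i,
                "prev-word": sentence[i - 1] if i > 0 else "<START>",
                "next-word": sentence[i + 1] if i < len(sentence) - 1 else "<END>"}
    return features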
Example #5
def extract_speaker(sentence):
    tokenized_sentence = nltk.word_tokenize(sentence)
    for i, word in enumerate(tokenized_sentence):
        classification = classifier.classify(
            pos_features(tokenized_sentence, i))
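
Example #5 stops after classifying each token. A sketch of how the speaker prefix could then be assembled, reusing the takewhile/join pattern from Example #1 and the same nltk, classifier, pos_features and itertools names as the surrounding examples; the return value is an assumption, not part of the original snippet.

def extract_speaker(sentence):
    tokenized_sentence = nltk.word_tokenize(sentence)
    word_pos = [(word, classifier.classify(pos_features(tokenized_sentence, i)))
                for i, word in enumerate(tokenized_sentence)]
    # keep the leading run of tokens classified as speaker tokens
    speaker = list(itertools.takewhile(lambda x: x[1] == True, word_pos))
    return "".join(s[0] for s in speaker)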
Example #7
import copy
import json

import nltk
from sklearn.cross_validation import train_test_split

from himymutil.naive import NaiveClassifier
from himymutil.ml import pos_features, assess_classifier

with open("data/import/trained_sentences.json", "r") as json_file:
    json_data = json.load(json_file)

tagged_sents = []
for sentence in json_data:
    tagged_sents.append([(word["word"], word["speaker"]) for word in sentence["words"]])

featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    sentence_pos = nltk.pos_tag(untagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, sentence_pos, i), tag))

train_data, test_data = train_test_split(featuresets, test_size=0.20, train_size=0.80)

# compare the naive baseline against NLTK's trained classifiers on the held-out test data
table = []
table.append(assess_classifier(NaiveClassifier(), test_data, "Naive"))
table.append(assess_classifier(nltk.NaiveBayesClassifier.train(train_data), test_data, "Naive Bayes"))
table.append(assess_classifier(nltk.DecisionTreeClassifier.train(train_data), test_data, "Decision Tree All In"))


def get_rid_of(entry, *keys):
    """ Remove the given feature keys from a featureset dict in place
    """
    for key in keys:
        del entry[key]


tmp_train_data = copy.deepcopy(train_data)
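
The snippet ends here. A plausible continuation, sketched under the assumption that the goal is to retrain the decision tree with some features stripped out; the feature key below is hypothetical, not taken from the original.

for features, _tag in tmp_train_data:
    get_rid_of(features, "position")  # hypothetical feature key

table.append(assess_classifier(nltk.DecisionTreeClassifier.train(tmp_train_data),
                               test_data, "Decision Tree minus position"))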