def all_words(lyrics):
    """Yield the lowercase form of every usable word in *lyrics*.

    A word is usable when its lyric entry has a "title", it is not a
    stopword, and valid() accepts the POS tag produced by the
    module-level unigram_tagger.

    :param lyrics: iterable of dicts with "lyrics" (list of lines) and
                   optionally "title".
    """
    for l in lyrics:
        # Hoisted out of the word loop: the title check does not depend
        # on the word, and untitled lyrics can never yield anything, so
        # skip them before paying for tokenization at all.
        if not l.get("title"):
            continue
        for w in nltk.word_tokenize(" ".join(l["lyrics"])):
            # print w, unigram_tagger.tag([w])
            if (not is_stopword(w)) and valid(unigram_tagger.tag([w])[0][1]):
                yield w.lower()
def add_info(lyric, text_col, i=0): def prep_word(w): return w.lower() a = dict(lyric) a["lyrics"] = Text(w.lower() for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8"))) a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w)) a["tf_idf"] = dict(sorted( ( (t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"] ), key=lambda x : x[1]) ) i += 1 print i return a
def add_info(lyric, text_col, i=0): def prep_word(w): return w.lower() a = dict(lyric) a["lyrics"] = Text( w.lower() for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8"))) a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w)) a["tf_idf"] = dict( sorted(((t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"]), key=lambda x: x[1])) i += 1 print i return a
def add_info(lyric, text_col, i=0): def prep_word(w): return w.lower() a = dict(lyric) a["lyrics"] = Text(w.lower() for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8"))) a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w)) a["tf_idf"] = dict(sorted( ( (t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"] ), key=lambda x : x[1]) ) i += 1 print i return a ts = TextCollection([Text((w.lower() for w in nltk.word_tokenize(" ".join(l["lyrics"]).encode('utf-8')) if not is_stopword(w) and valid(w) ) , name = l["id"]) for l in lyrics if l.get("id")]) #lyrics = map(lambda x : add_info(x[1], ts, x[0]), ((i, l) for i, l in enumerate(lyrics) if l.get("id"))) #with open("withtfidf.pickle", "w") as f: # pickle.dump(lyrics, f) with open("withtfidf.pickle", "r") as f: lyrics = pickle.load(f) lyrics = dict((l["id"], l) for l in lyrics) def similar_lyrics(lyric): for l in lyrics.values():
def all_words(lyrics):
    """Generate the lowercase form of each token from titled lyric
    entries, skipping stopwords and tokens whose POS tag fails valid()."""
    for entry in lyrics:
        joined = " ".join(entry["lyrics"])
        for token in nltk.word_tokenize(joined):
            # print token, unigram_tagger.tag([token])
            # Guard clauses preserve the original short-circuit order:
            # title check, then stopword check, then tagging.
            if not entry.get("title"):
                continue
            if is_stopword(token):
                continue
            if valid(unigram_tagger.tag([token])[0][1]):
                yield token.lower()
w.lower() for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8"))) a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w)) a["tf_idf"] = dict( sorted(((t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"]), key=lambda x: x[1])) i += 1 print i return a ts = TextCollection([ Text((w.lower() for w in nltk.word_tokenize(" ".join(l["lyrics"]).encode('utf-8')) if not is_stopword(w) and valid(w)), name=l["id"]) for l in lyrics if l.get("id") ]) #lyrics = map(lambda x : add_info(x[1], ts, x[0]), ((i, l) for i, l in enumerate(lyrics) if l.get("id"))) #with open("withtfidf.pickle", "w") as f: # pickle.dump(lyrics, f) with open("withtfidf.pickle", "r") as f: lyrics = pickle.load(f) lyrics = dict((l["id"], l) for l in lyrics) def similar_lyrics(lyric):