def load_docs():
    """Load every entry of the ``data`` directory as a text blob.

    Each name returned by ``listdir('data')`` is opened as a text file, its
    lines are joined with single spaces, and the result is wrapped with
    ``tb``.

    Returns:
        A two-element list ``[bloblist, pairlist]``: ``bloblist`` holds the
        blobs in the order ``listdir`` produced the names, and ``pairlist``
        holds ``[school_name, blob]`` pairs in that same order.
    """
    bloblist = []
    pairlist = []
    for school_name in listdir('data'):
        # The context manager closes the handle even when reading the file
        # or building the blob raises; the manual open()/close() pair did not.
        with open('data/' + school_name, 'r') as f:
            blob = tb(' '.join(f.readlines()))
        bloblist.append(blob)
        pairlist.append([school_name, blob])
    return [bloblist, pairlist]
def run_tfidf(document_list):
    """Print and collect the five highest TF-IDF words of each document.

    Args:
        document_list: iterable of raw document texts; each one is wrapped
            with ``tb`` before scoring.

    Returns:
        A flat list of ``(word, score)`` tuples: up to five per document,
        highest score first, documents in input order.
    """
    documentlist = [tb(document) for document in document_list]
    important_words = []
    for i, document in enumerate(documentlist):
        # Single-argument print(...) emits the same text whether it is parsed
        # as the Python 2 statement or the Python 3 function.
        print("Top words in document %s" % (i + 1))
        # Candidate words are the space-separated tokens of the document; a
        # repeated token simply overwrites its own entry.
        scores = dict(
            (word, tfidf(word, document, documentlist))
            for word in document.split(' ')
        )
        top_words = sorted(
            scores.items(), key=lambda item: item[1], reverse=True
        )[:5]
        for word, score in top_words:
            # %s formats the word directly; forcing it through str() fails on
            # non-ASCII unicode words under Python 2.
            print("Word: %s, TF-IDF: %s" % (word, score))
        important_words.extend(top_words)
    return important_words
def new_run_tfidf(document_list):
    """Print and collect the five highest-scoring words of each document.

    Same reporting as ``run_tfidf``, but scores come from ``new_tfidf``,
    which is handed two lookup structures built once up front by
    ``populate_document_dictionary`` and ``populate_containing_dictionary``
    (both defined elsewhere; presumably per-document and per-corpus word
    statistics -- confirm against those helpers).

    Args:
        document_list: iterable of raw document texts; each one is wrapped
            with ``tb`` before scoring.

    Returns:
        A flat list of ``(word, score)`` tuples: up to five per document,
        highest score first, documents in input order.
    """
    documentlist = [tb(document) for document in document_list]
    document_dictionary = populate_document_dictionary(documentlist)
    idf_dictionary = populate_containing_dictionary(documentlist)
    important_word_freqs = []
    for i, document in enumerate(documentlist):
        # Single-argument print(...) emits the same text whether it is parsed
        # as the Python 2 statement or the Python 3 function.
        print("Top words in document %s" % (i + 1))
        # enumerate() already yields an int index, so it is passed through
        # to new_tfidf unchanged.
        scores = dict(
            (
                word,
                new_tfidf(
                    word, i, document_dictionary, idf_dictionary, documentlist
                ),
            )
            for word in document.split(' ')
        )
        top_words = sorted(
            scores.items(), key=lambda item: item[1], reverse=True
        )[:5]
        for word, score in top_words:
            # %s formats the word directly; forcing it through str() fails on
            # non-ASCII unicode words under Python 2.
            print("Word: %s, TF-IDF: %s" % (word, score))
        important_word_freqs.extend(top_words)
    return important_word_freqs
return blob.words.count(word) / len(blob.words) def n_containing(word, bloblist): return sum(1 for blob in bloblist if word in blob) def idf(word, bloblist): return math.log(len(bloblist) / (1 + n_containing(word, bloblist))) def tfidf(word, blob, bloblist): return tf(word, blob) * idf(word, bloblist) document1 = tb("""Python is a 2000 made-for-TV horror movie directed by Richard Clabaugh. The film features several cult favorite actors, including William Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy, Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean Whalen. The film concerns a genetically engineered snake, a python, that escapes and unleashes itself on a small town. It includes the classic final girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles, California and Malibu, California. Python was followed by two sequels: Python II (2002) and Boa vs. Python (2004), both also made-for-TV films.""") document2 = tb("""Python, from the Greek word (πύθων/πύθωνας), is a genus of nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are recognised.[2] A member of this genus, P. reticulatus, is among the longest snakes known.""") document3 = tb("""The Colt Python is a .357 Magnum caliber revolver formerly manufactured by Colt's Manufacturing Company of Hartford, Connecticut. It is sometimes referred to as a "Combat Magnum".[1] It was first introduced in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued Colt Python targeted the premium revolver market segment. Some firearm
0.6: "#AE70ED", 0.5: "#C79BF2", 0.4: "#CEA8F4", 0.3: "#DBBFF7", 0.2: "#E1CAF9", 0.1: "#EDDFFB", 0: "#F5EEFD", -0.1: "#FFF2F2", -0.2: "#FFECEC", -0.3: "#FFDFDF", -0.4: "#FFCECE", -0.5: "#FFBBBB", -0.6: "#FFA8A8", -0.7: "#FF9797", -0.8: "#FF8A8A", -0.9: "#FF7575", -1: "#FF4848" } tb(pos).words html = "" for word in tb(pos).words: try: html += '<font color="{0}">{1}</font>'.format(colors[round(tb(word).polarity,1)], word) except: html += '<font color="#F5EEFD">{0}</font>'.format(word) print html changes to file in github
def create_TextBlob(fname):
    """Read the file *fname* as ASCII text and return it wrapped by ``tb``.

    The file is decoded with ``errors='ignore'``, so anything that is not
    valid ASCII is dropped rather than raising.
    """
    # Pull the whole file into memory; the handle is released on block exit.
    with codecs.open(fname, encoding='ascii', errors='ignore') as source:
        contents = source.read()
    # Wrap the decoded text in a blob for the caller.
    return tb(str(contents))
print doc + ".txt" # ++++++++++++++++++++++++++ # Open and pre-process files # ++++++++++++++++++++++++++ with open("testabstract.txt", "r") as myfile: document4 = myfile.read().replace('\n', '') document4 = ''.join(i for i in document4 if not i.isdigit()) document4 = ''.join([ x for x in document4 if ((ord(x) == 32) or (47 <= ord(x) < 127)) ]) # remove non-ascii characters document4 = re.sub(r'\s\d+', ' ', document4) # substitute matching regexp space w/digit(s) document4 = re.sub(r'[^\x00-\xF5]', ' ', document4) # substitute matching regexp w/space document4 = document4.replace("'s", '').lower() #make lower case document4 = tb(document4) with open("testintro.txt", "r") as myfile: document5 = myfile.read().replace('\n', '') document5 = ''.join(i for i in document5 if not i.isdigit()) document5 = ''.join([ x for x in document5 if ((ord(x) == 32) or (47 <= ord(x) < 127)) ]) # remove non-ascii characters document5 = re.sub(r'\s\d+', ' ', document5) # substitute matching regexp space w/digit(s) document5 = re.sub(r'[^\x00-\xF5]', ' ', document5) # substitute matching regexp w/space document5 = document5.replace("'s", '').lower() document5 = tb(document5) with open("testconclusion.txt", "r") as myfile:
import math
from text.blob import TextBlob as tb


def tf(word, blob):
    """Return the share of *blob*'s words that are *word*.

    Returns 0.0 for a blob with no words instead of dividing by zero.
    """
    total = len(blob.words)
    if not total:
        return 0.0
    # Explicit float division: on Python 2 int / int floors, which would
    # turn nearly every term frequency into 0.
    return float(blob.words.count(word)) / total


def n_containing(word, bloblist):
    """Return how many blobs in *bloblist* contain *word* as a word."""
    # Test against the tokenised words, as tf() does. ``word in blob`` is a
    # substring test on the raw text, so a short word would also be counted
    # in documents that merely contain it inside a longer word.
    return sum(1 for blob in bloblist if word in blob.words)


def idf(word, bloblist):
    """Return the smoothed inverse document frequency of *word*.

    Computed as ``log(N / (1 + df))`` where ``N`` is ``len(bloblist)`` and
    ``df`` is ``n_containing(word, bloblist)``. The ``+ 1`` keeps the
    denominator non-zero for unseen words; as a consequence a word found in
    every document gets a negative value.

    Raises:
        ValueError: if *bloblist* is empty (logarithm of zero).
    """
    # Explicit float division: on Python 2 the floored ratio becomes 0 for a
    # word present in every document, and math.log(0) raises.
    return math.log(float(len(bloblist)) / (1 + n_containing(word, bloblist)))


def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of *word* in *blob* relative to *bloblist*."""
    return tf(word, blob) * idf(word, bloblist)


# Sample corpus: three passages used to demonstrate the scoring functions.
document1 = tb(""" Dari Sayyar bin Salamah r.a., katanya: "Aku datang bersama ayahku kepada Abu Barzah Al Aslami, lalu ayahku bertanya kepadanya, "Bagaimana caranya Rasulullah saw. melakukan solat fardu?" Jawab Abu Barzah, "Nabi melakukan solat Zohor atau yang anda namakan solat 'Al Uulaa' (solat pertama) ialah ketika matahari tergelincir ke barat. Dan beliau solat Asar, ketika salah seorang kami kembali dari perjalanannya ke ujung kota, sedangkan matahari masih terasa panasnya. Dan aku (kata Sayyar) lupa ucapannya tentang solat Maghrib. Dan Nabi lebih suka mengundurkan solat Isyak yang anda namakan Al 'Atamah, dan beliau tidak menyukai tidur sebelum solat Isyak , dan bercakap-cakap sesudahnya. Dan selesai solat Subuh ketika seseorang telah mengenal orang duduk di samping, sedangkan Nabi membaca dalam solat itu sebanyak 60 sampai 100 ayat." """)

document2 = tb(""" Diberitakan oleh Abu Hurairah r.a., bahawa Nabi saw. masuk ke dalam masjid, kemudian masuk pula seorang laki-laki lalu dia solat. Selesai solat, orang itu datang dan memberi salam kepada Nabi saw. Nabi menolak orang itu dan bersabda, "Ulangilah solat anda kembali kerana anda belum solat!" Maka diulanginya solatnya kembali, kemudian dia datang dan memberi salam kepada Nabi. Sabda Nabi, "Ulangilah solat anda kembali, kerana anda belum solat!" Sampai tiga kali Nabi saw. menyuruh orang itu demikian. Jawab orang itu, "Demi Allah, yang telah mengutus anda dengan yang hak. Aku belum tahu cara solat yang lebih baik dari itu. kerana itu ajarkanlah kepada ku." Jawab Nabi saw., "Apabila engkau berdiri hendak solat, ucapkanlah Takbir. 
Kemudian baca ayat-ayat Quran yang mudah bagi mu. Kemudian rukuk sehingga engkau tenang dalam rukuk itu, kemudian bangkit, berdiri lurus kembali. Sesudah itu sujud, sehingga engkau tenang dalam sujudmu itu. Sesudah itu bangkit, duduk, sehingga engkau tenang tenteram dalam dudukmu itu. Kemudian sujud, sehingga engkau tenang dalam sujudmu itu. Lakukanlah seperti itu setiap kamu solat." """)

document3 = tb(""" Anas bin Malik r.a. menceritakan, bahawa Rasulullah saw. pada suatu kali mengenderai kuda. Rasulullah jatuh dari kuda itu sehingga luka rusuk beliau yang sebelah kanan. kerana itu Nabi solat sambil duduk dan kami pun solat di belakang beliau sambil duduk pula. Selesai solat beliau bersabda: "Sesungguhnya imam diadakan untuk diikut. kerana itu apabila imam itu berdiri, maka berdiri pulalah kamu. Apabila dia rukuk, rukuk pulalah, apabila dia bangkit, bangkit pulalah, dan apabila dia mengucapkan Sami'allahu liman hamidah, maka ucapkanlah Rabbana lakal hamd. Dan apabila dia solat berdiri, maka solat pulalah berdiri, dan apabila dia solat sambil duduk, maka solat pulalah kamu sekalian sambil duduk. " Kata Abu Abdullah, Humaidi berkata, bahawa sabda Rasulullah saw., 'Apabila dia solat sambil duduk, maka solat pulalah kamu sambil duduk, ialah sabda beliau ketika dia sakit dahulu. Sesudah itu beliau (Nabi saw.) pernah solat sambil duduk, sedang orang banyak di belakangnya berdiri. Tetapi beliau tidak menyuruh mereka duduk. Dari kedua keterangan ini tentu diambil yang terakhir dari perbuatan Nabi itu." """)

bloblist = [document1, document2, document3]

# Report the three highest-scoring words of every document in the corpus.
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))