def pl_preprocessing(total_pl):
    train_data = []
    train_y = []
    NUM_PL = 8
    D_WORD = 300
    tf_idf = TFIDF(total_pl)
    pl_cnt, words = tf_idf.get_tfidf()
    # label
    l = 0
    for field in total_pl:
        # print(field)
        for num, j in enumerate(field):
            m = get_pl_v(j, pl_cnt, NUM_PL, D_WORD)
            if len(m) == 2400:
                train_data.append(m)
                train_y.append(l)
            else:
                pass
        l += 1
    # print(i)
    # print(t,s)
    return train_data, train_y
def testBaseFC(seedUrls, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    # seedUrls is treated here as the path to the seed-URL file
    # (the original referenced an undefined name `seedFile`)
    docs = downloadRawDocs(seedUrls)
    seedURLs = getSeedURLs(seedUrls)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    #print urls_tokens
    #print title_tokens
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
def baseFC(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    crawlParams["priorityQueue"] = priorityQueue
    mytfidf = TFIDF()
    mytfidf.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer'] = mytfidf
    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    '''
    f = open("base-logData.txt","w")
    furl = open("base-Output-URLs.txt","w")
    for p in crawler.relevantPages:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
        ftext = open("base-webpages/"+str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    bres = evaluator.evaluateFC(crawler.relevantPages)
    writeEvaluation(bres,"base-evaluateData.txt")
    print sum(bres)
    print len(bres)
    '''
    return crawler.relevantPages
def ask_question(qs_input, top_k):
    """ Ask one question and generate responses for tfidf, lm and cnn """
    print("Question : %s" % qs_input)
    print("Top k : %d" % top_k)
    random.seed(12345)
    retrieval_data_start_time = time.clock()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(
        "Data/pred_QA-pair.csv")
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)
    print("Retrieval Data Finished")
    retrieval_data_end_time = time.clock()
    print("Retrieval Data cost %f" % (retrieval_data_end_time - retrieval_data_start_time))

    response_start_time = time.clock()
    lm = LM(questions, pred_questions, answers, pred_answers, word_sentence_dict)
    tfidf = TFIDF(questions, pred_questions, answers, pred_answers, word_sentence_dict)
    cnn = CNN(questions, pred_questions, answers, pred_answers, word_sentence_dict, isTrain=False)
    _, lm_response = lm.ask_response(qs_input, top_k=top_k)
    tfidf_response_id, tfidf_response = tfidf.ask_response(qs_input, top_k=top_k * 10)
    cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)
    for i in range(top_k):
        print("LM response %d: %s" % (i + 1, lm_response[i]))
    for i in range(top_k):
        print("TFIDF response %d: %s" % (i + 1, tfidf_response[i]))
    for i in range(top_k):
        print("CNN response %d: %s" % (i + 1, cnn_response[i]))
    print("Response Finished")
    response_end_time = time.clock()
    print("Response cost %f" % (response_end_time - response_start_time))
def baseFC_OneTargetVector(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    crawlParams["priorityQueue"] = priorityQueue
    mytfidf = TFIDF()
    mytfidf.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer'] = mytfidf
    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
def get_similarity(self, matrix=None, langue=None):
    if langue is None:
        self.set_sparse_matrix(matrix)
    else:
        tfidf = TFIDF(matrix, langue)
        sparse_matrix = tfidf.get_sparse_matrix()
        self.set_sparse_matrix(sparse_matrix)
    if self.dr == False:
        dimensionality_reduction = int(round(len(matrix[0]) * 3 / 4))
        self.set_dimensionality_reduction(dimensionality_reduction)
    permutation_matrix = self._get_permutation_matrix()
    signature_matrix = self._get_signature_matrix(permutation_matrix)
    similarity_matrix = self._get_similarity_matrix(signature_matrix)
    return similarity_matrix
class SearchEngine:
    """
    SearchEngine selects a search engine based on the user's choice and
    returns the scores of the query words.
    """

    def __init__(self, file_name):
        """Creates a search engine backed by PageRank and TF-IDF

        Args:
            file_name: path to xml files of wiki dump
        """
        # build corpus from xml files
        self.corpus, self.links = build_corpus(file_name)
        self.tf_idf = TFIDF(self.corpus)
        print("TFIDF engine has started")
        self.reverse_index = {word: set(mapping.keys())
                              for word, mapping in self.tf_idf.tf_idf.items()}
        self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
        print("PageRank engine has started")

    def search(self, query, mode, limit=10):
        """Sends `process_text(query)` to the search engine selected by `mode`
        and returns article titles and associated scores up to `limit`.
        Results are sorted by their scores in descending order.

        Args:
            query: raw query string
            mode: 'TF-IDF|PageRank|smart'
            limit: int

        Returns:
            A list of tuples. Each tuple is a document title and score pair.
        """
        # process the raw query string into a cleaner version, removing
        # all punctuation and whitespace
        keywords = process_text(query)
        if mode == 'TF-IDF':
            return self.tf_idf.search(keywords, limit)
        elif mode == 'PageRank':
            return self.page_rank.search(keywords, limit)
        elif mode == 'smart':
            return self.smart_search(keywords, limit)
        raise ValueError('Undefined search mode')

    def smart_search(self, keywords, limit=None):
        """
        Returns the scores of the query words based on a combination of the
        TF-IDF score and the PageRank score.
        """
        smart_scores = {}
        tf_idf = self.tf_idf.tf_idf
        page_rank = self.page_rank.page_rank
        for word in keywords:
            if word in self.reverse_index:
                for page in self.reverse_index[word]:
                    if page not in smart_scores:
                        smart_scores[page] = 0
                    smart_scores[page] += tf_idf[word][page] + page_rank[page]
        result = sorted(smart_scores.items(), key=lambda x: x[1], reverse=True)
        return result[:limit]
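# A minimal usage sketch for the class above. The dump path and the query are
# hypothetical; it only assumes build_corpus/process_text are importable as in
# the class itself.
if __name__ == "__main__":
    engine = SearchEngine("wiki-dump.xml")  # hypothetical path to a wiki dump
    for title, score in engine.search("information retrieval", "smart", limit=5):
        print(title, score)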
def get_dominating_words(context_dict, corpusdir):
    tfidf = TFIDF(corpusdir)
    dominating = init_dict()
    cache = dict()
    for t in context_dict.keys():
        contexts = context_dict[t]
        for c in contexts:
            curr_max = (None, -1)
            for tok in c:
                if tok == "-ENT-":
                    break
                if not cache.has_key(tok):
                    cache[tok] = tfidf.idf(tok)
                if cache[tok] > curr_max[1]:
                    curr_max = (tok, cache[tok])
            if curr_max[0] is not None:
                dominating[t].append(curr_max[0])
    return dominating
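# For reference, a minimal sketch of the standard IDF that an idf() method like
# the one used above typically computes. This is one common variant; the
# project's own TFIDF class may smooth or normalize differently.
import math

def idf(term, documents):
    """idf(t) = log(N / df(t)), where df(t) = number of documents containing t."""
    df = sum(1 for doc in documents if term in doc)
    return math.log(len(documents) / float(df)) if df else 0.0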
def testEventFC(seedFile, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def kfcvkNN(mi, k=10):
    correct = []
    tested = []
    tot = 0
    cor = 0
    for i in range(k):
        #mark_test_set(mi, k, i)
        MessageFeatures.test_fold = i
        MessageFeatures.folds = k
        tf = TFIDF(mi, 3)
        tf.train1()
        tf.correct = 0
        c = 0
        t = 0
        for m in mi:
            if m.isTest(mi.num_msgs):
                cl = tf.get_class_kNN(m)
                #print(cl)
                if cl == m.newsgroupnum:
                    c += 1
                t += 1
        print(tf.correct)
        correct.append(c)
        tested.append(t)
        tot += t
        cor += c
    print(1.0 * cor / tot)
def part1(documentPath, maximumDocuments=0):
    startTime = time.time()
    print("Executing code for Part 1...\n")

    print("Extracting data from XML Document...")
    values = XMLParse(documentPath, maximumDocuments)
    print("Number of Documents: " + str(len(values)))
    extractionTime = round(time.time() - startTime, 3)
    print("Time: " + str(extractionTime) + " seconds")

    print("Removing stopwords and stemming...")
    for i in range(len(values) - 1, -1, -1):
        if values[i].hasField('BODY'):
            values[i].setField('BODY', removeStopwords(values[i].getField("BODY")))
        else:
            del values[i]
    removingTime = round(time.time() - startTime - extractionTime, 3)
    print("Time: " + str(removingTime) + " seconds")

    print("Creating list of all unique words in corpus...")
    uniqueWords = getUniqueWords(values)
    uniqueWordsTime = round(time.time() - startTime - extractionTime - removingTime, 3)
    print("Time: " + str(uniqueWordsTime) + " seconds")

    print("Computing TF, IDF, and TFIDF...")
    computedTFIDF = TFIDF(values, uniqueWords)
    idfTime = round(time.time() - startTime - extractionTime - removingTime - uniqueWordsTime, 3)
    print("Time: " + str(idfTime) + " seconds")

    print("Computing Cosine Similarity...")
    computedTFIDF.calculateCosineSimilarity()
    #computedTFIDF.printVal('sim', 19)
    cosineSimTime = round(time.time() - startTime - extractionTime - removingTime - uniqueWordsTime - idfTime, 3)
    print("Time: " + str(cosineSimTime) + " seconds")

    print('\nPart 1 Complete')
    print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
    return computedTFIDF
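# For reference, a minimal numpy sketch of the cosine similarity that a
# calculateCosineSimilarity() step like the one above computes between two
# TF-IDF vectors. This is a generic stand-in, not the project's implementation.
import numpy as np

def cosine_similarity(u, v):
    """Return cos(u, v) = (u . v) / (||u|| * ||v||), or 0.0 for zero vectors."""
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0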
def testEventFC(seedUrls, pLimit, eventTree):
    #print 'GIVEN TREE:'
    #print eventTree
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)
    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = os.open(eventFile, os.O_CREAT | os.O_RDWR)
    os.write(fw, eventTree.lower())
    os.close(fw)
    mytfidf = TFIDF()  # appears to work fine (called then exited)
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    #print 'cleandocs'
    #print cleandocs
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def testBaseFC(seedUrls, pLimit):
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    #print urls_tokens
    #print title_tokens
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
def testEventFC(seedUrls, pLimit, eventTree):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = open(seedFile, "w")  # "rw" is not a valid mode for (re)creating the file
    f.write(seedUrls)
    f.close()
    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = open(eventFile, "w")
    fw.write(eventTree)
    fw.close()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def test():
    mytfidf = TFIDF()
    docs = downloadRawDocs("typhoon_haiyan_SEED_URLs.txt")
    seedURLs = getSeedURLs("typhoon_haiyan_SEED_URLs.txt")
    pagesLimit = 1000
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    #print urls_tokens
    #print title_tokens
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    baseFC(mytfidf, options)
def get_keywords(self, pageText, count):
    mytfidf = TFIDF()
    tokenPageText = getTokenizedDocs([pageText])
    token_bow = [mytfidf.doc2bow(doc) for doc in tokenPageText]
    mytfidf.buildVocabIndex(token_bow)
    selected = mytfidf.selectImportantWords_tf(count)
    wordsList = mytfidf.index.keys()
    selected_words = [wordsList[k[1]] for k in selected]
    return selected_words
def __init__(self, modelInstance):
    self.model = modelInstance
    features = [
        cosine_similarity.CosineSimilarity(),
        n_gram_matching.NGramMatching(),
        sentiment_feature.SentimentFeature(),
        SVD.SVD(),
        TFIDF.TFIDF(),
        baseline_features.BaselineFeature(),
        cue_words.CueWords()
    ]
    self.features_train = np.hstack([feature.read() for feature in features])
    self.labels_train = DataSet(path="../FNC-1").get_labels()
    self.features_test = np.hstack(
        [feature.read('competition_test') for feature in features])
    self.labels_test = DataSet(path="../FNC-1", name="competition_test").get_labels()
def main():
    ######### Input and Output ##########
    lilypath = 'lily'  # IMPORTANT! Set your own lily path and stopWords path
    stopWordspath = 'Chinese-stop-words.txt'  # IMPORTANT!
    stopWords = codecs.open(stopWordspath, 'r', 'gbk')
    inputfile = {}
    outputfile = {}
    filenames = os.listdir(lilypath)
    cnt = 0
    for filename in filenames:
        inputfile[cnt] = codecs.open(lilypath + '/' + filename, 'r', 'utf-8')
        outputfile[cnt] = open(filename, 'w+')
        cnt += 1
    ############# TFIDF #############
    TFIDF(inputfile, outputfile, stopWords, cnt)  # The TFIDF algorithm
    for i in range(0, cnt):
        inputfile[i].close()
        outputfile[i].close()
    stopWords.close()
def tfidf(mi):
    MessageFeatures.test_fold = -1
    tf = TFIDF(mi, 3)
    tf.train1()
    cj = 0
    cj_count = 0
    tf.correct = 0
    for m in mi:
        if cj_count >= 20:
            cj_count = 0
            cj += 1
        elif m.newsgroupnum == cj:
            cj_count += 1
        c = tf.get_class_kNN(m)
        print(c)
    print(tf.correct)
def get_tfidf():
    total_data = get_raw_pl()
    tf_idf = TFIDF(total_data)
    tfidf_scores, words = tf_idf.get_tfidf()
    return tfidf_scores
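# For comparison only: a self-contained sketch of the same idea using
# scikit-learn's TfidfVectorizer. The snippets in this section use their own
# TFIDF class, so this is an illustrative stand-in, not that class's API.
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_sklearn(docs):
    """Return (tfidf_matrix, vocabulary) for a list of raw text documents."""
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs)  # shape: (n_docs, n_terms)
    return tfidf_matrix, vectorizer.get_feature_names_out()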
def process_document():
    import nltk
    from pymongo import TEXT
    if 'corpus' in session:
        tagger = ner.SocketNER(host="localhost", port=8080)
        collection_name = session['corpus']
        folder_name = session['folder_name']
        destination_path = os.path.join(app.config['UPLOAD_FOLDER'], collection_name, folder_name)
        data_array = []
        content_array = []
        pageRankSummarizer = PageRankSummarizer()
        tfidf_parser = TFIDF()
        content_collection_name = collection_name + "_content"
        content_table = DBUtils().get_collection_obj(content_collection_name)
        new_collection = DBUtils().get_collection_obj(collection_name)
        for input_file_name in glob.glob(destination_path + "/*.txt"):
            dict_entities = {}
            file_content = []
            no_of_entities = 0
            file_line_content = filter(None, [re.sub(r'[^\x00-\x7F]+', ' ', line.rstrip('\n\r')).strip()
                                              for line in open(input_file_name, 'r')])
            for line in file_line_content:
                dict_line_entities = tagger.get_entities(line)
                file_content.append(tagger.tag_text(line))
                for key, value in dict_line_entities.iteritems():
                    no_of_entities += len(value)
                    if key in dict_entities:
                        dict_entities[key] = list(set(dict_entities[key] + value))
                    else:
                        dict_entities[key] = list(set(value))
            list_entity_frequency = []
            str_content = " ".join(file_content)
            content_table.insert([{"__content": x} for x in nltk.sent_tokenize(" ".join(file_line_content))])
            value_template = "<{{type}}>{{value}}</{{type}}>"
            for entity_type, list_value in dict_entities.iteritems():
                for value in list_value:
                    value_str = value_template.replace("{{type}}", entity_type).replace("{{value}}", value)
                    list_entity_frequency.append([value_str, str_content.count(value_str)])
            dict_entities['__entity_frequency'] = list_entity_frequency
            dict_entities['__word_frequency'] = tfidf_parser.compute_word_frequency(file_line_content)
            blob_file_content = TextBlob(str_content)
            dict_entities['__document_length'] = len(re.findall(r'\w+', str_content))
            dict_entities['__num_entities'] = no_of_entities
            dict_entities['__polarity'] = blob_file_content.sentiment.polarity
            dict_entities['__subjectivity'] = blob_file_content.sentiment.subjectivity
            dict_entities['__formatted_content'] = file_content
            dict_entities['__content'] = file_line_content
            # if(request.form['title'] != "?"):
            #     selected_title_option = int(request.form['title'])
            #     if(selected_title_option == 1):
            #         dict_entities['TITLE'] = os.path.basename(input_file_name)
            #     else:
            #         dict_entities['TITLE'] = file_line_content[selected_title_option-1]
            dict_entities['SUMMARY'] = pageRankSummarizer.summarize(file_line_content,
                                                                    int(request.form['summary-lines']))
            # dict_entities['SUMMARY'] = pageRankSummarizer.summarize(file_line_content, 2)
            dict_entities['ID'] = os.path.basename(input_file_name)
            dict_entities['__read_count'] = 0
            new_collection.insert(dict_entities)
        content_table.create_index([('__content', TEXT)], default_language='english')
        ## Generate a table with the names of all the columns so that this can be referenced further..
        ## Caution: Needs to be updated whenever a new entity type is created..
        DBUtils().generate_keys_table(collection_name)
        # Set the session value..
        session['token'] = collection_name
        # os.remove(os.path.join(app.config['UPLOAD_FOLDER'], collection_name))
        return json.dumps({"success": True, "redirect": url_for('.visualize')})
    return json.dumps({"success": False})
faq = pd.read_csv('../data/interim/faq-text-separated.csv', keep_default_na=False)
test_questions = pd.read_csv('../data/test/test-questions.csv')
features = ['Topic', 'Category', 'Department', 'question', 'answer']

test_topics = pd.read_excel(
    '../../../Inquire Boulder request data- detailed open and closed - for research purposes.xlsx')
test_topics = test_topics[['Description', 'Topic']]
test_topics = test_topics.rename(index=str,
                                 columns={"Description": "test_question",
                                          "Topic": "match_topic"})

# # Evaluate KDTree on questions
# kdtree = KDTREE(faq, features, 'KDTREE')
# kdtree.evaluate(test_questions, 'questions')

# # Evaluate Word2Vec on questions
# w2v = W2V(faq, features, 'W2V')
# w2v.evaluate(test_questions, 'questions')
# w2v.evaluate(test_topics, 'topics')

# Evaluate TFIDF on questions and topics
tfidf = TFIDF(faq, features, 'TFIDF')
tfidf.evaluate(test_questions, 'questions')
# tfidf.evaluate(test_topics, 'topics')
class MovieTensor:
    model = None
    db = None
    tfIdf = None

    def __init__(self, model):
        self.model = model
        self.db = DBConnect()
        self.tfIdf = TFIDF("", "", "_actor_")

    def getListAsString(self, moviesList):
        moviesListStr = str(moviesList)
        moviesListStr = moviesListStr.replace('[', '(')
        moviesListStr = moviesListStr.replace(']', ')')
        return moviesListStr

    def getTensor(self):
        if self.model == 1:
            yearsCountQuery = "select count(distinct year) from mlmovies"
            #movieActorsCountQuery = "select count(distinct movieid) from mlmovies where movieid in (6058,9818,5914,6097,7232,9443,7062,8929,4354,10059) "
            res = self.db.executeQuery(yearsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfDistinctYear = int(countString)

            # get the no of actors
            movieActorsCountQuery = "select count(*) from imdb_actor_info "
            #movieActorsCountQuery = "select count(distinct actorid) from imdb_actor_info where actorid in (17838,45899,61523,68671,96585,99457,128645,133985) "
            res = self.db.executeQuery(movieActorsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfActors = int(countString)

            # get the no of movies
            movieActorsCountQuery = "select count(*) from mlmovies "
            #movieActorsCountQuery = "select count(distinct movieid) from mlmovies where movieid in (6058,9818,5914,6097,7232,9443,7062,8929,4354,10059) "
            res = self.db.executeQuery(movieActorsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfMovies = int(countString)
            #noOfMovies = 2

            # actorMovieYearTensor = np.ndarray(shape=(noOfActors,noOfMovies,noOfDistinctYear))
            # for i in range(0,noOfActors):
            #     for j in range(0,noOfMovies):
            #         for k in range(0,noOfDistinctYear):
            #             actorMovieYearTensor[i,j,k] = 0.0
            #             #print actorMovieYearTensor[i,j,k]

            # build movie indices
            movieIdVsIndex = {}
            movieIndexVsName = {}
            query = "select * from mlmovies order by movieid"
            #query = "select * from mlmovies where movieid in (6058,9818,5914,6097,7232,9443,7062,8929,4354,10059) order by movieid"
            movieIndex = 0
            res = self.db.executeQuery(query)
            for movie in res:
                movieId = movie[0]
                movieName = movie[1]
                movieIdVsIndex[movieId] = movieIndex
                movieIndexVsName[movieIndex] = movieName
                movieIndex = movieIndex + 1

            # build year indices
            yearVsIndex = {}
            yearIndexVsYear = {}
            q = "select distinct year from mlmovies order by year"
            res = self.db.executeQuery(q)
            yearIndex = 0
            for yearRow in res:
                year = yearRow[0]
                yearVsIndex[str(year)] = yearIndex
                yearIndexVsYear[yearIndex] = year
                yearIndex = yearIndex + 1

            actorMovieYearMatrix = np.zeros((noOfActors, noOfMovies, noOfDistinctYear))
            query = "select * from imdb_actor_info order by actorid "
            actors = self.db.executeQuery(query)
            actorIndex = 0
            actorIdVsIndex = {}
            actorIndexVsName = {}
            for actor in actors:
                actorid = actor[0]
                actorName = actor[1]
                actorrelatedMoviesQ = "select * from movie_actor where actorid = " + str(actorid)
                actorrelatedMovies = self.db.executeQuery(actorrelatedMoviesQ)
                movieIds = []
                for movie in actorrelatedMovies:
                    movieIds.append(movie[0])
                # we got the movies
                moviesQuery = "select * from mlmovies where movieid in " + self.getListAsString(movieIds)
                res = self.db.executeQuery(moviesQuery)
                for movieYear in res:
                    movieid = movieYear[0]
                    year = movieYear[2]
                    #actorMovieYearTensor[actorIndex,movieIdVsIndex[movieid],yearVsIndex[str(year)]] = 1.0
                    actorMovieYearMatrix[actorIndex][movieIdVsIndex[movieid]][yearVsIndex[str(year)]] = 1
                actorIdVsIndex[actorid] = actorIndex
                actorIndexVsName[actorIndex] = actorName
                actorIndex = actorIndex + 1

            actorMovieYearMatrix[0][0][0] = 1
            actorMovieYearMatrix[1][1][1] = 1
            actorMovieYearTensor = tl.tensor(actorMovieYearMatrix)
            decomposed = dec.parafac(actorMovieYearTensor, rank=5)
            semanticsActor = decomposed[0]
            semanticsMovie = decomposed[1]
            semanticsYear = decomposed[2]

            for i in range(0, semanticsActor.shape[1]):
                actorsRow = semanticsActor[:, i]
                mean = np.mean(actorsRow)
                print("ACTORS GROUPED UNDER LATENT SEMANTICS {0}".format(i + 1))
                for j in range(0, noOfActors):
                    if (actorsRow[j] >= mean):
                        print(actorIndexVsName[j])

            for i in range(0, semanticsMovie.shape[1]):
                moviesRow = semanticsMovie[:, i]
                mean = np.mean(moviesRow)
                print("MOVIES GROUPED UNDER LATENT SEMANTICS {0}".format(i + 1))
                for j in range(0, noOfMovies):
                    if (moviesRow[j] >= mean):
                        print(movieIndexVsName[j])

            for i in range(0, semanticsYear.shape[1]):
                yearsRow = semanticsYear[:, i]
                mean = np.mean(yearsRow)
                print("YEARS GROUPED UNDER LATENT SEMANTICS {0}".format(i + 1))
                for j in range(0, noOfDistinctYear):
                    if (yearsRow[j] >= mean):
                        print(yearIndexVsYear[j])

        elif self.model == 2:
            noOfTags = 0
            query = "select count(*) from genome_tags"
            count = self.db.executeQuery(query)
            countStr = self.tfIdf.getCount(str(count[0]))
            noOfTags = int(countStr)

            # get the no of movies
            movieActorsCountQuery = "select count(*) from mlmovies "
            res = self.db.executeQuery(movieActorsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfMovies = int(countString)

            q = "select count(distinct rating) from mlratings"
            res = self.db.executeQuery(q)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfRatings = int(countString)

            tagMovieRatingMatrix = np.zeros((noOfTags, noOfMovies, noOfRatings))
            #print tagMovieRatingTensor

            # build tag index
            query = "select * from genome_tags order by tagid"
            tags = self.db.executeQuery(query)
            tagIndex = 0
            tagIdVsIndex = {}
            tagIndexVsName = {}
            for tag in tags:
                tagid = tag[0]
                tagName = tag[1]
                tagIdVsIndex[tagid] = tagIndex
                tagIndexVsName[tagIndex] = tagName
                tagIndex = tagIndex + 1

            query = "select * from mlmovies order by movieid"
            movieIndex = 0
            movieIdVsIndex = {}
            movieIndexVsName = {}
            movies = self.db.executeQuery(query)
            for movie in movies:
                movieid = movie[0]
                movieName = movie[1]
                movieIdVsIndex[movieid] = movieIndex
                movieIndexVsName[movieIndex] = movieName
                movieTagsQ = "select * from mltags where movieid = " + str(movieid)
                movieTags = self.db.executeQuery(movieTagsQ)
                movieTagsList = []
                for movieTag in movieTags:
                    movieTagsList.append(movieTag[2])
                totalNoOfRatingsQ = "select count(*) from mlratings where movieid = " + str(movieid)
                res = self.db.executeQuery(totalNoOfRatingsQ)
                totalRatingsStr = self.tfIdf.getCount(str(res[0]))
                totalRatings = int(totalRatingsStr)
                sumQ = "select movieid, sum(rating) from mlratings where movieid = " + str(movieid) + " group by movieid"
                res = self.db.executeQuery(sumQ)
                sumRating = 0
                for r in res:
                    sumRating = sumRating + r[1]
                avgRating = float(sumRating) / totalRatings
                for tag in movieTagsList:
                    tagIndex = tagIdVsIndex[tag]
                    for i in range(1, noOfRatings + 1):
                        if avgRating <= float(i):
                            tagMovieRatingMatrix[tagIndex][movieIndex][i - 1] = 1
                            #print "setting one"
                movieIndex = movieIndex + 1

            tagMovieRatingMatrix[0][0][0] = 1
            tagMovieRatingMatrix[1][1][1] = 1
            tagMovieRatingTensor = tl.tensor(tagMovieRatingMatrix)
            decomposed = dec.parafac(tagMovieRatingTensor, rank=5)
            semanticsTag = decomposed[0]
            semanticsMovie = decomposed[1]
            semanticsRating = decomposed[2]

            for i in range(0, semanticsTag.shape[1]):
                tagRows = semanticsTag[:, i]
                mean = np.mean(tagRows)
                print("TAGS GROUPED UNDER LATENT SEMANTICS {0}".format(i + 1))
                for j in range(0, noOfTags):
                    if (tagRows[j] >= mean):
                        print(tagIndexVsName[j])

            for i in range(0, semanticsMovie.shape[1]):
                movieRows = semanticsMovie[:, i]
                mean = np.mean(movieRows)
                print("MOVIES GROUPED UNDER LATENT SEMANTICS {0}".format(i + 1))
                for j in range(0, noOfMovies):
                    if (movieRows[j] >= mean):
                        print(movieIndexVsName[j])

            for i in range(0, semanticsRating.shape[1]):
                ratingRows = semanticsRating[:, i]
                mean = np.mean(ratingRows)
                print("RATINGS GROUPED UNDER LATENT SEMANTICS {0}".format(i + 1))
                for j in range(0, noOfRatings):
                    if (ratingRows[j] >= mean):
                        print(j + 1)
def cnn_output(input_file_name, output_file_name, output_num, top_k):
    """ Generate cnn outputs """
    random.seed(12345)
    retrieval_data_start_time = time.clock()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(input_file_name)
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)
    print("Retrieval Data Finished")
    retrieval_data_end_time = time.clock()
    print("Retrieval Data cost %f" % (retrieval_data_end_time - retrieval_data_start_time))

    cnn_response_start_time = time.clock()
    tfidf = TFIDF(questions, pred_questions, answers, pred_answers, word_sentence_dict)
    cnn = CNN(questions, pred_questions, answers, pred_answers, word_sentence_dict, isTrain=False)

    if output_file_name.split(".")[-1] == "txt":
        output = open(output_file_name, "w")
        for i in range(output_num):
            qs_index = int(random.random() * len(questions))
            qs_input = questions[qs_index].encode("utf-8")
            output.write("Question : %s\n" % qs_input)
            tfidf_response_id, tfidf_response = tfidf.ask_response(qs_input, top_k * 10)
            cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)
            for i in range(top_k):
                output.write("CNN response %d: %s\n" % (i + 1, cnn_response[i].encode("utf-8")))
            output.write("\n")
        output.close()
        cnn_response_end_time = time.clock()
        print("CNN response cost %f" % (cnn_response_end_time - cnn_response_start_time))

    if output_file_name.split(".")[-1] == "csv":
        with open(output_file_name, 'w') as csvfile:
            fieldnames = ['Question']
            fieldnames.extend(["Reply " + str(i + 1) for i in range(top_k)])
            fieldnames.append("Score")
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for i in range(output_num):
                dict = {"Score": ""}
                qs_index = int(random.random() * len(questions))
                qs_input = questions[qs_index].encode("utf-8")
                dict["Question"] = qs_input
                tfidf_response_id, tfidf_response = tfidf.ask_response(qs_input, top_k * 10)
                cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)
                for i in range(min(top_k, len(cnn_response))):
                    dict["Reply " + str(i + 1)] = cnn_response[i].encode("utf-8")
                writer.writerow(dict)
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))

# labels_df starts at df[5000] so we're good on the matching of labels to content
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))

from TFIDF import TFIDF
# creates TFIDF matrix
TFIDF(corpus)

##############################################################################
################################## KMEANS ####################################
##############################################################################
from sklearn.externals import joblib

# Loads my pre-existing kmeans model
# Saves the model you just made
#joblib.dump(km, '700_No_Ngram.pkl')
km = joblib.load("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/KMeans_Cluster_Models/350_no_Ngram.pkl")
clusters = km.labels_.tolist()

# Only to create a new kmeans model
from sklearn.cluster import KMeans
def main():
    command = sys.argv[1]
    no = int(sys.argv[2])
    if command == "CP":
        if no == 1:
            tensor = MovieTensor(1)
            tensor.getTensor()
        elif no == 2:
            tensor = MovieTensor(2)
            tensor.getTensor()
    elif command == "SVD":
        allactormoviesdata = pandas.read_csv("movie-actor.csv")
        alltagsdata = pandas.read_csv("mltags.csv")
        allactormoviesdata['max_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(max)
        allactormoviesdata['min_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(min)
        allactormoviesdata['actor_rank_weightage'] = allactormoviesdata.apply(
            compute_actor_weightage, axis=1)
        # print(allactormoviesdata)
        min_timestamp = pandas.to_datetime(min(alltagsdata['timestamp']))
        max_timestamp = pandas.to_datetime(max(alltagsdata['timestamp']))
        alltagsdata['timestamp_weightage'] = alltagsdata.apply(
            CalculateTimestampWeights, axis=1, args=(min_timestamp, max_timestamp))
        mergeddata = allactormoviesdata[['actorid', 'movieid', 'actor_rank_weightage']].merge(
            alltagsdata[['movieid', 'tagid', 'timestamp_weightage']], on='movieid')
        #print(mergeddata[mergeddata['actorid'].isin([878356,1860883,316365,128645])])
        mergeddata['total_weightage'] = mergeddata.apply(aggregate_tf_weightages, axis=1)
        mergeddata['tag_weightage'] = mergeddata.groupby(
            ['actorid', 'tagid'])['total_weightage'].transform('sum')
        tfdata = mergeddata[['actorid', 'tagid', 'tag_weightage']].drop_duplicates(
            subset=['tagid', 'actorid'])
        tfdata['total_weightage_actor'] = tfdata.groupby(
            ['actorid'])['tag_weightage'].transform('sum')
        tfdata['tf'] = tfdata.apply(ComputeTF, axis=1)
        taglist = tfdata['tagid'].tolist()
        alltagsdata = pandas.read_csv("mltags.csv")
        alltagsdata = alltagsdata[alltagsdata['tagid'].isin(taglist)]
        #print(alltagsdata)
        allactormoviesdata = pandas.read_csv("movie-actor.csv")
        requiredtagsdata = alltagsdata.merge(allactormoviesdata, on='movieid')
        requiredtagsdata.drop_duplicates(subset=['tagid', 'actorid'], inplace=True)
        requiredtagsdata['actor_count'] = requiredtagsdata.groupby('tagid')['actorid'].transform('count')
        requiredtagsdata.drop_duplicates(subset=['tagid'], inplace=True)
        actordata = pandas.read_csv("imdb-actor-info.csv")
        total_actors = actordata.shape[0]
        requiredtagsdata['idf'] = requiredtagsdata.apply(ComputeIDF, axis=1, total_actors=total_actors)
        # print(total_actors)
        # print(requiredtagsdata)
        tfidfdata = ProcessTFandIDFtoTFIDF(tfdata, requiredtagsdata[['tagid', 'idf']])
        # print(tfdata)
        #tfidfdata = tfidfdata[tfidfdata['actorid'].isin([878356,1860883,316365,128645])]
        #print(tfidfdata)
        actor_tag_matrix = tfidfdata.pivot_table(index='actorid', columns='tagid',
                                                 values='tfidf', fill_value=0)
        print "Actor Tag Matrix"
        print actor_tag_matrix
        tf = TFIDF("", 1, "_actor_")
        tf.calcMoviesVector()
from ExtractAbstract import ExtractAbstract
from InformationContent import InformationContent
from TFIDF import TFIDF as TFIDF
from ClusterRelatedness import ClusterRelatedness
from DimensionRelatedness import DimensionRelatedness
from RelatednessGraph import RelatednessGraph

if __name__ == "__main__":
    '''
    PART 1 - 1
    Calculate IC and TFIDF
    '''
    IC = InformationContent("./source/ic.txt")
    #DEBUG: IC.printSortedList()
    TfIdf = TFIDF("./source/tfidf.txt")
    #DEBUG: TfIdf.printSortedList()

    '''
    PART 1 - 2
    Use IC and TFIDF to extract words from abstracts
    '''
    Extractor = ExtractAbstract("./source/corpus5.csv", IC, TfIdf, 0.35, 0.3)  # IC threshold / TFIDF threshold

    '''
    PART 2
    Calculate Relatedness
    '''
    # Finding Relatedness 1 - Find Vector Cluster
    # ClusterRelatedness = ClusterRelatedness("./source/vectors.txt", "./abstracts/", Extractor.fileNum)
    # Finding Relatedness 2 - Compare Word Pairs
def get_Dnn_model(total_pl):
    NUM_PL = 8
    D_WORD = 300
    tf_idf = TFIDF(total_pl)
    pl_cnt, words = tf_idf.get_tfidf()
    x, y = pl_preprocessing(total_pl, NUM_PL)
    x = np.array(x)
    y = np.array(y)

    ###### test data drawn from the same source as the training data ######
    X_train, X_test1, Y_train, y_test1 = train_test_split(x, y, test_size=0.2)
    data = CrossValidationFolds(X_train, Y_train, FOLDS)
    (X_train1, y_train1), (X_valid1, y_valid1) = data.split()

    ###### test data from a different source than the training data ######
    # data = CrossValidationFolds(x, y, FOLDS)
    # (X_train1, y_train1), (X_valid1, y_valid1) = data.split()
    # X_test1, y_test1 = load_pl('../new_Steeve_data/filter_Dice/can/')
    # X_test1 = np.array(X_test1)
    # y_test1 = np.array(y_test1)

    ##### testing data ######
    # Tx = X_test1[0]
    # Ty = y_test1[0]
    # Tx = Tx.reshape([1,-1])
    # print(Tx.shape)
    # print(X_test1.shape)

    ### previous settings
    FOLDS = 5
    in_units = D_WORD * NUM_PL
    n_class = 6  # the task only requires recognizing classes 0, 1, 2, 3 and 4, i.e. 5 classes
    n_train = len(X_train1)  # length of the training data
    batch_size = 50
    n_batch = n_train // batch_size

    X = tf.placeholder(tf.float32, [None, in_units], name="X")  # input placeholder, shape [None, 784]
    y = tf.placeholder(tf.int64, shape=(None), name="y")  # label placeholder, shape [None]

    logits = L_layers_model(X, 128, n_class, 0.5)
    Y_proba = tf.nn.softmax(logits, name="Y_proba")
    loss, train_op = train_op(y, logits)
    accuracy, precision, recall = acc_model(y, logits)
    prediction = tf.argmax(Y_proba, 1)

    saver = tf.train.Saver()  # call save function
    config = tf.ConfigProto(device_count={'GPU': 1})  # pin to the GPU

    # Params for Train
    epochs = 1000  # 10 for augmented training data, 20 for training data
    val_step = 100  # evaluate accuracy on the validation data every 50 steps

    # Training cycle
    max_acc = 0.  # Save the maximum accuracy value for validation data
    early_stop_limit = 0  # record the early-stop counter

    init = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()
    with tf.Session(config=config) as sess:
        run(sess, X_train1, y_train1, X_valid1, y_valid1)
        sess.run(init_l)
        saver.restore(sess, '../dnn_model.ckpt')  # reload the model saved by early stopping
        print('Acc_test :', sess.run(accuracy, feed_dict={X: X_test1, y: y_test1}))
        print('Prec_value :', sess.run(precision, feed_dict={X: X_test1, y: y_test1}))
        print('Recall_value :', sess.run(recall, feed_dict={X: X_test1, y: y_test1}))
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))

# labels_df starts at df[5000] so we're good on the matching of labels to content
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))

import os
os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Pre-Processing")
from TFIDF import TFIDF
tfidf_matrix = TFIDF(corpus)

####################################################################
############################## HAC #################################
####################################################################
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

hac = AgglomerativeClustering(n_clusters=500, affinity="euclidean")
dense_matrix = tfidf_matrix.todense()
hac.fit_predict(dense_matrix)

from sklearn.externals import joblib
# Saves the model you just made
joblib.dump(hac, '350_euc_HAC.pkl')