Example #1
def pl_preprocessing(total_pl):
    train_data = []
    train_y = []
    NUM_PL = 8
    D_WORD = 300
    tf_idf = TFIDF(total_pl)
    pl_cnt, words = tf_idf.get_tfidf()

    # class label, incremented once per field
    l = 0

    for field in total_pl:
        #         print(field)
        for num, j in enumerate(field):

            m = get_pl_v(j, pl_cnt, NUM_PL, D_WORD)
            # keep only vectors with the full NUM_PL * D_WORD (= 2400) dimensions
            if len(m) == 2400:
                train_data.append(m)
                train_y.append(l)
        l += 1
#                             print(i)
#     print(t,s)

    return train_data, train_y
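The `TFIDF` helper classes used throughout these examples come from their respective projects and are not shown here. For orientation, below is a minimal, self-contained sketch of the standard term-frequency times inverse-document-frequency weighting such a class typically computes; the function name and the return shape are illustrative only, not any project's actual API.

import math
from collections import Counter

def tfidf_scores(docs):
    """Map each document index to {term: tf-idf weight} for a list of tokenized documents."""
    n_docs = len(docs)
    # document frequency: the number of documents each term appears in
    df = Counter(term for doc in docs for term in set(doc))
    scores = {}
    for i, doc in enumerate(docs):
        tf = Counter(doc)
        total = float(len(doc)) or 1.0
        scores[i] = {term: (count / total) * math.log(n_docs / float(df[term]))
                     for term, count in tf.items()}
    return scores

print(tfidf_scores([["data", "mining", "tfidf"], ["data", "science"], ["tfidf", "weighting"]]))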
def testBaseFC(seedUrls, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    
    
    # Write the seedUrls to a file so downloadRawDocs/getSeedURLs can read them back
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = open(seedFile, "w")
    f.write(seedUrls)
    f.close()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold , "seeds":seedURLs}
    #print urls_tokens
    #print title_tokens    
    
    cleandocs = getTokenizedDocs(docs)
    
    pos = cleandocs
    
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf,options)
def testEventFC(seedUrls, pLimit, eventTree):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    
    
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = open(seedFile,"rw")
    f.write(seedUrls)
    f.close()
    
    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = open(eventFile,"rw")
    fw.write(eventTree)
    fw.close()
    
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold , "seeds":seedURLs}
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def testBaseFC(seedUrls, pLimit):
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT|os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    options = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold , "seeds":seedURLs}
    #print urls_tokens
    #print title_tokens    
    
    cleandocs = getTokenizedDocs(docs)
    
    pos = cleandocs
    
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf,options)
Example #5
def testBaseFC(seedUrls, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seedUrls to a file so downloadRawDocs/getSeedURLs can read them back
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = open(seedFile, "w")
    f.write(seedUrls)
    f.close()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    #print urls_tokens
    #print title_tokens

    cleandocs = getTokenizedDocs(docs)

    pos = cleandocs

    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
def baseFC(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1,p,-1,"") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    
    crawlParams["priorityQueue"]=priorityQueue
    mytfidf = TFIDF()
    
    mytfidf.buildModel(crawlParams['model'],crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer']=mytfidf
    
    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()

    '''
    f = open("base-logData.txt","w")
    furl = open("base-Output-URLs.txt","w")
    for p in crawler.relevantPages:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
        ftext = open("base-webpages/"+str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    bres = evaluator.evaluateFC(crawler.relevantPages)
    writeEvaluation(bres,"base-evaluateData.txt")    
    print sum(bres)
    print len(bres)
    '''
    return crawler.relevantPages
Example #7
    def __init__(self, file_name):
        """Creates a search engine backed by PageRank and TF-IDF

        Args:
            file_name: path to xml files of wiki dump
        """
        # build corpus from xml files
        self.corpus, self.links = build_corpus(file_name)
        self.tf_idf = TFIDF(self.corpus)
        print("TFIDF engine has started")
        self.reverse_index = {word: set(mapping.keys())
                              for word, mapping in self.tf_idf.tf_idf.items()}
        self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
        print("PageRank engine has started")
Example #8
def ask_question(qs_input, top_k):
    """
    Ask one question and generate response for tfidf, lm and cnn
    """

    print("Question : %s" % qs_input)
    print("Top k : : %d" % top_k)

    random.seed(12345)
    retrieval_data_start_time = time.clock()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(
        "Data/pred_QA-pair.csv")
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)

    print("Retrieval Data Finished")

    retrieval_data_end_time = time.clock()
    print("Retrieval Data cost %f" %
          (retrieval_data_end_time - retrieval_data_start_time))

    response_start_time = time.clock()

    lm = LM(questions, pred_questions, answers, pred_answers,
            word_sentence_dict)
    tfidf = TFIDF(questions, pred_questions, answers, pred_answers,
                  word_sentence_dict)
    cnn = CNN(questions,
              pred_questions,
              answers,
              pred_answers,
              word_sentence_dict,
              isTrain=False)

    _, lm_response = lm.ask_response(qs_input, top_k=top_k)
    tfidf_response_id, tfidf_response = tfidf.ask_response(qs_input,
                                                           top_k=top_k * 10)
    cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)

    for i in range(top_k):
        print("LM response %d: %s" % (i + 1, lm_response[i]))
    for i in range(top_k):
        print("TFIDF response %d: %s" % (i + 1, tfidf_response[i]))
    for i in range(top_k):
        print("CNN response %d: %s" % (i + 1, cnn_response[i]))

    print("Response Finished")

    response_end_time = time.clock()
    print("Response cost %f" % (response_end_time - response_start_time))
def baseFC(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1,p,-1,"") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    
    crawlParams["priorityQueue"]=priorityQueue
    mytfidf = TFIDF()
    
    mytfidf.buildModel(crawlParams['model'],crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer']=mytfidf
    
    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
Example #10
def baseFC_OneTargetVector(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)

    crawlParams["priorityQueue"] = priorityQueue
    mytfidf = TFIDF()

    mytfidf.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer'] = mytfidf

    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
def testEventFC(seedFile, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold , "seeds":seedURLs}
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def get_dominating_words(context_dict, corpusdir):
    tfidf = TFIDF(corpusdir)
    dominating = init_dict()
    cache = dict()
    for t in context_dict.keys():
        contexts = context_dict[t]
        for c in contexts:
            curr_max = (None, -1)
            for tok in c:
                if tok == "-ENT-":
                    break
                if tok not in cache:
                    cache[tok] = tfidf.idf(tok)
                if cache[tok] > curr_max[1]:
                    curr_max = (tok, cache[tok])
            if curr_max[0] is not None:
                dominating[t].append(curr_max[0])
    return dominating
Example #13
    def get_similarity(self, matrix=None, langue=None):

        if langue is None:
            self.set_sparse_matrix(matrix)

        else:
            tfidf = TFIDF(matrix, langue)
            sparse_matrix = tfidf.get_sparse_matrix()
            self.set_sparse_matrix(sparse_matrix)

        if not self.dr:
            dimensionality_reduction = int(round((len(matrix[0]) * 3 / 4)))
            self.set_dimensionality_reduction(dimensionality_reduction)

        permutation_matrix = self._get_permutation_matrix()
        signature_matrix = self._get_signature_matrix(permutation_matrix)
        similarity_matrix = self._get_similarity_matrix(signature_matrix)
        return similarity_matrix
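The private helpers in get_similarity (permutation, signature, and similarity matrices) suggest a MinHash-style estimate of set similarity. The project's own implementation is not shown, so the following is only a rough, self-contained sketch of that idea, with hypothetical token sets as input.

import random

def minhash_signature(sets, num_perm=64, seed=0):
    """One row per random permutation; entry j is the smallest permuted rank found in set j."""
    universe = sorted(set().union(*sets))
    rng = random.Random(seed)
    signature = []
    for _ in range(num_perm):
        perm = universe[:]
        rng.shuffle(perm)
        rank = {item: r for r, item in enumerate(perm)}
        signature.append([min(rank[item] for item in s) for s in sets])
    return signature

def estimated_similarity(signature, a, b):
    """Fraction of permutations where columns a and b agree, approximating Jaccard similarity."""
    return sum(row[a] == row[b] for row in signature) / float(len(signature))

docs = [{"storm", "coast", "rain"}, {"storm", "coast", "wind"}, {"stock", "market"}]
sig = minhash_signature(docs)
print(estimated_similarity(sig, 0, 1))  # close to the true Jaccard similarity of 0.5
print(estimated_similarity(sig, 0, 2))  # disjoint sets -> 0.0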
Example #14
class SearchEngine:
    """
    SearchEngine dispatches a query to the ranking backend chosen by the user (TF-IDF, PageRank, or a combined "smart" mode) and returns scored results.
    """
    def __init__(self, file_name):
        """Creates a search engine backed by PageRank and TF-IDF

        Args:
            file_name: path to xml files of wiki dump
        """
        # build corpus from xml files
        self.corpus, self.links = build_corpus(file_name)
        self.tf_idf = TFIDF(self.corpus)
        print("TFIDF engine has started")
        self.reverse_index = {word: set(mapping.keys())
                              for word, mapping in self.tf_idf.tf_idf.items()}
        self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
        print("PageRank engine has started")

    def search(self, query, mode, limit=10):
        """Sends `process_text(query)` to the search engines selected by `mode` and returns article
            titles and associated scores up to `limited`. Results are sorted by their scores in
            a descending order.

        Args:
            query: raw query string
            mode: 'TF-IDF|PageRank|smart'
            limit: int

        Returns:
            A list of tuples. Each tuple is a document title and score pair.
        """
        keywords = process_text(query)  # process a raw query string to a cleaner version, remove
        # all the punctuations and white spaces
        if mode == 'TF-IDF':
            return self.tf_idf.search(keywords, limit)
        elif mode == 'PageRank':
            return self.page_rank.search(keywords, limit)
        elif mode == 'smart':
            return self.smart_search(keywords, limit)
        raise ValueError('Undefined search mode')

    def smart_search(self, keywords, limit=None):
        """
        Scores each matching page by adding its TF-IDF score for every query word to its PageRank score.
        """
        smart_scores = {}
        tf_idf = self.tf_idf.tf_idf
        page_rank = self.page_rank.page_rank
        for word in keywords:
            if word in self.reverse_index:
                for page in self.reverse_index[word]:
                    if page not in smart_scores:
                        smart_scores[page] = 0
                    smart_scores[page] += tf_idf[word][page] + page_rank[page]
        result = sorted(smart_scores.items(), key=lambda x: x[1], reverse=True)
        return result[:limit]
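The smart_search method above simply adds a page's TF-IDF score for each query word to its PageRank score and sorts the result. Here is a tiny self-contained sketch of that additive combination, with hypothetical hand-made score dictionaries standing in for self.tf_idf.tf_idf and self.page_rank.page_rank.

# Hypothetical per-word TF-IDF scores and per-page PageRank scores.
tf_idf = {
    "python": {"PageA": 0.8, "PageB": 0.1},
    "search": {"PageB": 0.5},
}
page_rank = {"PageA": 0.3, "PageB": 0.6}

def smart_scores(keywords, limit=None):
    scores = {}
    for word in keywords:
        for page, weight in tf_idf.get(word, {}).items():
            # as in smart_search above, PageRank is added once per matching keyword
            scores[page] = scores.get(page, 0.0) + weight + page_rank[page]
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]

print(smart_scores(["python", "search"]))  # PageB (~1.8) outranks PageA (~1.1)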
def get_dominating_words(context_dict, corpusdir):
    tfidf = TFIDF(corpusdir)
    dominating = init_dict()
    cache = dict()
    for t in context_dict.keys():
        contexts = context_dict[t]
        for c in contexts:
            curr_max = (None, -1)
            for tok in c:
                if tok == "-ENT-":
                    break
                if tok not in cache:
                    cache[tok] = tfidf.idf(tok)
                if cache[tok] > curr_max[1]:
                    curr_max = (tok, cache[tok])
            if curr_max[0] is not None:
                dominating[t].append(curr_max[0])
    return dominating
Example #16
def testEventFC(seedUrls, pLimit, eventTree):
    #print 'GIVEN TREE:'
    #print eventTree
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)

    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = os.open(eventFile, os.O_CREAT | os.O_RDWR)
    os.write(fw, eventTree.lower())
    os.close(fw)

    mytfidf = TFIDF()  # appears to work fine (called then exited)

    myEventScorer = EventScorer.EventScorer()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs

    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    #print 'cleandocs'
    #print cleandocs
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def testEventFC(seedFile, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def kfcvkNN(mi, k=10):
	correct = []
	tested = []
	tot = 0
	cor = 0
	for i in range(k):
		#mark_test_set(mi, k, i)
		MessageFeatures.test_fold = i
		MessageFeatures.folds = k
		
		tf = TFIDF(mi, 3)
		tf.train1()
		tf.correct = 0
		c = 0
		t = 0
		for m in mi:
			if m.isTest(mi.num_msgs):
				cl = tf.get_class_kNN(m)
				#print(cl)
				if cl == m.newsgroupnum:
					c+=1
				t+=1
		print(tf.correct)	
		correct.append(c)
		tested.append(t)
		tot+=t
		cor+=c
	print (1.0*cor/tot)
Example #19
def part1(documentPath, maximumDocuments=0):
  startTime = time.time()
  print("Executing code for Part 1...\n")

  print("Extracting data from XML Document...")
  values = XMLParse(documentPath, maximumDocuments)
  print("Number of Documents: "+str(len(values)))
  extractionTime = round(time.time() - startTime, 3)
  print("Time: " + str(extractionTime) + " seconds")

  print("Removing stopwords and stemming...")
  for i in range(len(values)-1, -1, -1):
    if values[i].hasField('BODY'):
      values[i].setField('BODY',removeStopwords(values[i].getField("BODY")))
    else:
      del values[i]
  removingTime = round(time.time() - startTime - extractionTime, 3)
  print("Time: " + str(removingTime) + " seconds")

  print("Creating list of all unique words in corpus...")
  uniqueWords = getUniqueWords(values)
  uniqueWordsTime = round(time.time() - startTime - extractionTime - removingTime, 3)
  print("Time: " + str(uniqueWordsTime) + " seconds")

  print("Computing TF, IDF, and TFIDF...")
  computedTFIDF = TFIDF(values, uniqueWords)
  idfTime = round(time.time() - startTime - extractionTime - removingTime - uniqueWordsTime, 3)
  print("Time: " + str(idfTime) + " seconds")

  print("Computing Cosine Similarity...")
  computedTFIDF.calculateCosineSimilarity()
  #computedTFIDF.printVal('sim', 19)
  cosineSimTime = round(time.time() - startTime - extractionTime - removingTime - uniqueWordsTime - idfTime, 3)
  print("Time: " + str(cosineSimTime) + " seconds")


  print('\nPart 1 Complete')
  print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
  return computedTFIDF
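calculateCosineSimilarity above belongs to the project's own TFIDF class; below is a minimal sketch of the underlying computation (the cosine of the angle between two TF-IDF vectors stored as term-to-weight dicts), using hypothetical vectors for illustration.

import math

def cosine_similarity(vec_a, vec_b):
    """Cosine similarity between two sparse vectors given as {term: weight} dicts."""
    dot = sum(w * vec_b.get(t, 0.0) for t, w in vec_a.items())
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

doc1 = {"oil": 0.9, "price": 0.4}
doc2 = {"oil": 0.7, "market": 0.5}
print(cosine_similarity(doc1, doc2))  # shared 'oil' term -> similarity > 0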
def testEventFC(seedUrls, pLimit, eventTree):    
    #print 'GIVEN TREE:'
    #print eventTree
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT|os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)

    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = os.open(eventFile, os.O_CREAT|os.O_RDWR)
    os.write(fw, eventTree.lower())
    os.close(fw)

    mytfidf = TFIDF() # appears to work fine (called then exited)


    myEventScorer = EventScorer.EventScorer() 

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs

    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold , "seeds":seedURLs}
    cleandocs = getTokenizedDocs(docs)
    #print 'cleandocs'
    #print cleandocs
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
Example #21
def testBaseFC(seedUrls, pLimit):
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    #print urls_tokens
    #print title_tokens

    cleandocs = getTokenizedDocs(docs)

    pos = cleandocs

    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
Example #22
def testEventFC(seedUrls, pLimit, eventTree):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = open(seedFile, "rw")
    f.write(seedUrls)
    f.close()

    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = open(eventFile, "rw")
    fw.write(eventTree)
    fw.close()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
Example #23
def test():
    mytfidf = TFIDF()
    docs = downloadRawDocs("typhoon_haiyan_SEED_URLs.txt")
    seedURLs = getSeedURLs("typhoon_haiyan_SEED_URLs.txt")
    pagesLimit = 1000
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold , "seeds":seedURLs}
    #print urls_tokens
    #print title_tokens    
    
    cleandocs = getTokenizedDocs(docs)
    
    pos = cleandocs
    
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    
    baseFC(mytfidf,options)
Example #24
 def get_keywords(self, pageText, count):
         mytfidf = TFIDF()
         tokenPageText = getTokenizedDocs([pageText])
         token_bow = [mytfidf.doc2bow(doc) for doc in tokenPageText]
         mytfidf.buildVocabIndex(token_bow)
         selected = mytfidf.selectImportantWords_tf(count)
         wordsList = mytfidf.index.keys()
         selected_words = [wordsList[k[1]] for k in selected]
         return selected_words
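selectImportantWords_tf in the snippet above ranks vocabulary entries by term frequency. A rough, self-contained sketch of that idea using collections.Counter (the real class's bag-of-words index bookkeeping is omitted):

from collections import Counter

def keywords_by_tf(tokens, count):
    """Return the `count` most frequent tokens, a crude keyword heuristic."""
    return [word for word, _ in Counter(tokens).most_common(count)]

print(keywords_by_tf("the crawler scores pages the crawler visits".split(), 2))
# ['the', 'crawler']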
Example #25
 def __init__(self, modelInstance):
     self.model = modelInstance
     features = [
         cosine_similarity.CosineSimilarity(),
         n_gram_matching.NGramMatching(),
         sentiment_feature.SentimentFeature(),
         SVD.SVD(),
         TFIDF.TFIDF(),
         baseline_features.BaselineFeature(),
         cue_words.CueWords()
     ]
     self.features_train = np.hstack(
         [feature.read() for feature in features])
     self.labels_train = DataSet(path="../FNC-1").get_labels()
     self.features_test = np.hstack(
         [feature.read('competition_test') for feature in features])
     self.labels_test = DataSet(path="../FNC-1",
                                name="competition_test").get_labels()
Example #26
def main():
    ######### Input and Output #########
    lilypath = 'lily'  # IMPORTANT! Set your own lily path and stop-words file
    stopWordspath = 'Chinese-stop-words.txt'  # IMPORTANT!
    stopWords = codecs.open(stopWordspath, 'r', 'gbk')
    inputfile = {}
    outputfile = {}
    filenames = os.listdir(lilypath)
    cnt = 0
    for filename in filenames:
        inputfile[cnt] = codecs.open(lilypath + '/' + filename, 'r', 'utf-8')
        outputfile[cnt] = open(filename, 'w+')
        cnt += 1
    ############# TFIDF #############
    TFIDF(inputfile, outputfile, stopWords, cnt)  # the TFIDF algorithm
    for i in range(0, cnt):
        inputfile[i].close()
        outputfile[i].close()
    stopWords.close()
def tfidf(mi):
	MessageFeatures.test_fold = -1
	tf = TFIDF(mi, 3)
	tf.train1()
	cj = 0
	cj_count = 0
	tf.correct = 0
	for m in mi:
		if cj_count >= 20:
			cj_count = 0
			cj += 1
		elif m.newsgroupnum == cj:
			cj_count += 1
			c = tf.get_class_kNN(m)
			print(c)
	print(tf.correct)
def get_tfidf():
    total_data = get_raw_pl()
    tf_idf = TFIDF(total_data)
    tfidf_scores, words = tf_idf.get_tfidf()

    return tfidf_scores
def process_document():
	import nltk
	from pymongo import TEXT

	if 'corpus' in session:
		tagger = ner.SocketNER(host="localhost", port=8080)
		collection_name = session['corpus']
		folder_name = session['folder_name']
		destination_path = os.path.join(app.config['UPLOAD_FOLDER'], collection_name, folder_name)
		data_array = []
		content_array = []
		pageRankSummarizer = PageRankSummarizer()
		tfidf_parser = TFIDF()

		content_collection_name = collection_name + "_content"
		content_table = DBUtils().get_collection_obj(content_collection_name)
		new_collection = DBUtils().get_collection_obj(collection_name)
		
		for input_file_name in glob.glob(destination_path+"/*.txt"):
			dict_entities = {}
			file_content = []

			no_of_entities = 0

			file_line_content = filter(None, [re.sub(r'[^\x00-\x7F]+',' ', line.rstrip('\n\r')).strip() for line in open(input_file_name, 'r')])
			for line in file_line_content:
				dict_line_entities = tagger.get_entities(line)
				file_content.append(tagger.tag_text(line))
				
				for key, value in dict_line_entities.iteritems():
					no_of_entities += len(value)
					if key in dict_entities:
						dict_entities[key] = list(set(dict_entities[key] + value))
					else:
						dict_entities[key] = list(set(value))
			
			list_entity_frequency = []
			str_content = " ".join(file_content)
			content_table.insert([{"__content": x} for x in nltk.sent_tokenize(" ".join(file_line_content))])

			value_template = "<{{type}}>{{value}}</{{type}}>"
			for entity_type,list_value in dict_entities.iteritems():
				for value in list_value:
					value_str = value_template.replace("{{type}}",entity_type).replace("{{value}}",value)
					list_entity_frequency.append([value_str,str_content.count(value_str)])

			dict_entities['__entity_frequency'] = list_entity_frequency
			dict_entities['__word_frequency'] = tfidf_parser.compute_word_frequency(file_line_content)

			blob_file_content = TextBlob(str_content)
			dict_entities['__document_length'] = len(re.findall(r'\w+', str_content))
			dict_entities['__num_entities'] = no_of_entities
			dict_entities['__polarity'] =  blob_file_content.sentiment.polarity
			dict_entities['__subjectivity'] = blob_file_content.sentiment.subjectivity
			
			dict_entities['__formatted_content'] = file_content
			dict_entities['__content'] = file_line_content

			# if(request.form['title'] != "?"):
			# 	selected_title_option = int(request.form['title'])
			# 	if(selected_title_option == 1):
			# 		dict_entities['TITLE'] = os.path.basename(input_file_name)
			# 	else:
			# 		dict_entities['TITLE'] = file_line_content[selected_title_option-1]

			dict_entities['SUMMARY'] = pageRankSummarizer.summarize(file_line_content, int(request.form['summary-lines']))
			# dict_entities['SUMMARY'] = pageRankSummarizer.summarize(file_line_content, 2)
			dict_entities['ID'] = os.path.basename(input_file_name)
			dict_entities['__read_count'] = 0
			new_collection.insert(dict_entities)
			
		content_table.create_index([('__content', TEXT)], default_language='english')

		##Generate a table with the names of all the columns so that this can be referenced further..
		##Caution: Needs to be updated when ever a new entity type is created..
		DBUtils().generate_keys_table(collection_name)
		
		#Set the session value..
		session['token'] = collection_name
		# os.remove(os.path.join(app.config['UPLOAD_FOLDER'], collection_name))
		return json.dumps({"success": True, "redirect": url_for('.visualize')})

	return json.dumps({"success": False})
Example #30
    faq = pd.read_csv('../data/interim/faq-text-separated.csv',
                      keep_default_na=False)
    test_questions = pd.read_csv('../data/test/test-questions.csv')
    features = ['Topic', 'Category', 'Department', 'question', 'answer']

    test_topics = pd.read_excel(
        '../../../Inquire Boulder request data- detailed open and closed - for research purposes.xlsx'
    )
    test_topics = test_topics[['Description', 'Topic']]
    test_topics = test_topics.rename(index=str,
                                     columns={
                                         "Description": "test_question",
                                         "Topic": "match_topic"
                                     })

    #     # Evaluate KDTree on questions
    #     kdtree = KDTREE(faq, features, 'KDTREE')
    #     kdtree.evaluate(test_questions, 'questions')

    #     # Evaluate Word2Vec on questions
    #     w2v = W2V(faq, features, 'W2V')
    #     w2v.evaluate(test_questions, 'questions')

    #     w2v.evaluate(test_topics, 'topics')

    # Evaluate TFIDF on questions and Topics
    tfidf = TFIDF(faq, features, 'TFIDF')
    tfidf.evaluate(test_questions, 'questions')

#     tfidf.evaluate(test_topics, 'topics')
class MovieTensor:
    model = None
    db = None
    tfIdf = None

    def __init__(self, model):
        self.model = model
        self.db = DBConnect()
        self.tfIdf = TFIDF("", "", "_actor_")

    def getListAsString(self, moviesList):
        moviesListStr = str(moviesList)
        moviesListStr = moviesListStr.replace('[', '(')
        moviesListStr = moviesListStr.replace(']', ')')
        return moviesListStr

    def getTensor(self):
        if self.model == 1:

            yearsCountQuery = "select count(distinct year) from mlmovies"
            #movieActorsCountQuery = "select count(distinct movieid) from mlmovies where movieid  in (6058,9818,5914,6097,7232,9443,7062,8929,4354,10059)  "
            res = self.db.executeQuery(yearsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfDistinctYear = int(countString)

            # get the no of actors
            movieActorsCountQuery = "select count(*) from imdb_actor_info  "
            #movieActorsCountQuery = "select count(distinct actorid) from imdb_actor_info  where actorid in (17838,45899,61523,68671,96585,99457,128645,133985) "
            res = self.db.executeQuery(movieActorsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfActors = int(countString)

            # get the no of movies
            movieActorsCountQuery = "select count(*) from mlmovies  "
            #movieActorsCountQuery = "select count(distinct movieid) from mlmovies where movieid  in (6058,9818,5914,6097,7232,9443,7062,8929,4354,10059)  "
            res = self.db.executeQuery(movieActorsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfMovies = int(countString)
            #noOfMovies = 2

            #            actorMovieYearTensor = np.ndarray(  shape=(noOfActors,noOfMovies,noOfDistinctYear))
            #            for i in range(0,noOfActors):
            #                for j in range(0,noOfMovies):
            #                    for k in range(0,noOfDistinctYear):
            #                        actorMovieYearTensor[i,j,k] = 0.0
            #                        #print actorMovieYearTensor[i,j,k]

            #build movie indices
            movieIdVsIndex = {}
            movieIndexVsName = {}
            query = "select * from mlmovies order by movieid"
            #query = "select *  from mlmovies where movieid  in (6058,9818,5914,6097,7232,9443,7062,8929,4354,10059) order by movieid"
            movieIndex = 0
            res = self.db.executeQuery(query)
            for movie in res:
                movieId = movie[0]
                movieName = movie[1]
                movieIdVsIndex[movieId] = movieIndex
                movieIndexVsName[movieIndex] = movieName
                movieIndex = movieIndex + 1

            #build year indices
            yearVsIndex = {}
            yearIndexVsYear = {}
            q = "select distinct year from mlmovies order by year"
            res = self.db.executeQuery(q)
            yearIndex = 0
            for yearRow in res:
                year = yearRow[0]
                yearVsIndex[str(year)] = yearIndex
                yearIndexVsYear[yearIndex] = year
                yearIndex = yearIndex + 1

            actorMovieYearMatrix = np.zeros(
                (noOfActors, noOfMovies, noOfDistinctYear))

            query = "select * from imdb_actor_info order by actorid "
            actors = self.db.executeQuery(query)
            actorIndex = 0
            actorIdVsIndex = {}
            actorIndexVsName = {}
            for actor in actors:
                actorid = actor[0]
                actorName = actor[1]
                actorrelatedMoviesQ = "select * from movie_actor where actorid = " + str(
                    actorid)
                actorrelatedMovies = self.db.executeQuery(actorrelatedMoviesQ)
                movieIds = []
                for movie in actorrelatedMovies:
                    movieIds.append(movie[0])
                # we got the movies
                moviesQuery = "select * from mlmovies where movieid in " + self.getListAsString(
                    movieIds)
                res = self.db.executeQuery(moviesQuery)
                for movieYear in res:
                    movieid = movieYear[0]
                    year = movieYear[2]
                    #actorMovieYearTensor[actorIndex,movieIdVsIndex[movieid],yearVsIndex[str(year)]] = 1.0
                    actorMovieYearMatrix[actorIndex][movieIdVsIndex[movieid]][
                        yearVsIndex[str(year)]] = 1

                actorIdVsIndex[actorid] = actorIndex
                actorIndexVsName[actorIndex] = actorName
                actorIndex = actorIndex + 1

            actorMovieYearMatrix[0][0][0] = 1
            actorMovieYearMatrix[1][1][1] = 1
            actorMovieYearTensor = tl.tensor(actorMovieYearMatrix)

            decomposed = dec.parafac(actorMovieYearTensor, rank=5)

            semanticsActor = decomposed[0]
            semanticsMovie = decomposed[1]
            semanticsYear = decomposed[2]
            for i in range(0, semanticsActor.shape[1]):

                actorsRow = semanticsActor[:, i]
                mean = np.mean(actorsRow)
                print("ACTORS GROUPED UNDER LATENT SEMANTICS {0} ".format(i +
                                                                          1))
                for j in range(0, noOfActors):
                    if (actorsRow[j] >= mean):
                        print(actorIndexVsName[j])

            for i in range(0, semanticsMovie.shape[1]):

                moviesRow = semanticsMovie[:, i]
                mean = np.mean(moviesRow)
                print("MOVIES GROUPED UNDER LATENT SEMANTICS {0}".format(i +
                                                                         1))
                for j in range(0, noOfMovies):
                    if (moviesRow[j] >= mean):
                        print(movieIndexVsName[j])

            for i in range(0, semanticsYear.shape[1]):
                yearsRow = semanticsYear[:, i]
                mean = np.mean(yearsRow)
                print("YEARS GROUPED UNDER LATENT SEMANTICS {0}".format(i + 1))
                for j in range(0, noOfDistinctYear):
                    if (yearsRow[j] >= mean):
                        print(yearIndexVsYear[j])

        elif self.model == 2:
            noOfTags = 0
            query = "select count(*) from genome_tags"
            count = self.db.executeQuery(query)
            countStr = self.tfIdf.getCount(str(count[0]))
            noOfTags = int(countStr)

            # get the no of movies
            movieActorsCountQuery = "select count(*) from mlmovies  "
            res = self.db.executeQuery(movieActorsCountQuery)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfMovies = int(countString)

            q = "select count(distinct rating) from mlratings"
            res = self.db.executeQuery(q)
            countStr = res[0]
            countString = str(countStr)
            countString = self.tfIdf.getCount(countString)
            noOfRatings = int(countString)

            tagMovieRatingMatrix = np.zeros(
                (noOfTags, noOfMovies, noOfRatings))

            #print tagMovieRatingTensor

            # build tag index
            query = "select * from genome_tags order by tagid"
            tags = self.db.executeQuery(query)
            tagIndex = 0
            tagIdVsIndex = {}
            tagIndexVsName = {}
            for tag in tags:
                tagid = tag[0]
                tagName = tag[1]
                tagIdVsIndex[tagid] = tagIndex
                tagIndexVsName[tagIndex] = tagName
                tagIndex = tagIndex + 1

            query = "select * from mlmovies order  by movieid"
            movieIndex = 0
            movieIdVsIndex = {}
            movieIndexVsName = {}
            movies = self.db.executeQuery(query)
            for movie in movies:
                movieid = movie[0]
                movieName = movie[1]
                movieIdVsIndex[movieid] = movieIndex
                movieIndexVsName[movieIndex] = movieName

                movieTagsQ = "select * from mltags where movieid = " + str(
                    movieid)
                movieTags = self.db.executeQuery(movieTagsQ)
                movieTagsList = []
                for movieTag in movieTags:
                    movieTagsList.append(movieTag[2])
                totalNoOfRatingsQ = "select count(*) from mlratings where movieid = " + str(
                    movieid)
                res = self.db.executeQuery(totalNoOfRatingsQ)
                totalRatingsStr = self.tfIdf.getCount(str(res[0]))
                totalRatings = int(totalRatingsStr)

                sumQ = "select movieid, sum(rating) from mlratings  where movieid = " + str(
                    movieid) + " group by movieid"
                res = self.db.executeQuery(sumQ)
                sumRating = 0
                for r in res:
                    sumRating = sumRating + r[1]
                avgRating = float(sumRating) / totalRatings

                for tag in movieTagsList:
                    tagIndex = tagIdVsIndex[tag]

                    for i in range(1, noOfRatings + 1):
                        if avgRating <= float(i):
                            tagMovieRatingMatrix[tagIndex][movieIndex][i -
                                                                       1] = 1
                            #print "setting one"

                movieIndex = movieIndex + 1

            tagMovieRatingMatrix[0][0][0] = 1
            tagMovieRatingMatrix[1][1][1] = 1
            tagMovieRatingTensor = tl.tensor(tagMovieRatingMatrix)

            decomposed = dec.parafac(tagMovieRatingTensor, rank=5)

            semanticsTag = decomposed[0]
            semanticsMovie = decomposed[1]
            semanticsRating = decomposed[2]

            for i in range(0, semanticsTag.shape[1]):

                tagRows = semanticsTag[:, i]
                mean = np.mean(tagRows)
                print(" TAGS GROUPED UNDER LATENT SEMANTICS {0} ".format(i +
                                                                         1))
                for j in range(0, noOfTags):
                    if (tagRows[j] >= mean):
                        print(tagIndexVsName[j])

            for i in range(0, semanticsMovie.shape[1]):

                movieRows = semanticsMovie[:, i]
                mean = np.mean(movieRows)
                print("MOVIES GROUPED UNDER LATENT SEMANTICS {0}".format(i +
                                                                         1))
                for j in range(0, noOfMovies):
                    if (movieRows[j] >= mean):
                        print(movieIndexVsName[j])

            for i in range(0, semanticsRating.shape[1]):
                ratingRows = semanticsRating[:, i]
                mean = np.mean(ratingRows)
                print("RATINGS GROUPED UNDER LATENT SEMANTICS {0}".format(i +
                                                                          1))
                for j in range(0, noOfRatings):
                    if (ratingRows[j] >= mean):
                        print(j + 1)
 def __init__(self, model):
     self.model = model
     self.db = DBConnect()
     self.tfIdf = TFIDF("", "", "_actor_")
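In getTensor above, each factor matrix column from the CP decomposition is read as a latent semantic, and entities whose loading is at or above the column mean are reported as members of that group. Below is a self-contained sketch of that thresholding step on a hypothetical factor matrix.

import numpy as np

# Hypothetical (entities x latent semantics) factor matrix, e.g. one factor of a CP decomposition.
factors = np.array([[0.9, 0.1],
                    [0.2, 0.8],
                    [0.7, 0.6]])
names = ["Actor A", "Actor B", "Actor C"]

for i in range(factors.shape[1]):
    column = factors[:, i]
    members = [names[j] for j in range(len(names)) if column[j] >= column.mean()]
    print("Entities grouped under latent semantic %d: %s" % (i + 1, members))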
Example #33
def cnn_output(input_file_name, output_file_name, output_num, top_k):
    """
    Generate cnn outputs
    """
    random.seed(12345)
    retrieval_data_start_time = time.clock()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(
        input_file_name)
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)

    print("Retrieval Data Finished")

    retrieval_data_end_time = time.clock()
    print("Retrieval Data cost %f" %
          (retrieval_data_end_time - retrieval_data_start_time))

    cnn_response_start_time = time.clock()

    tfidf = TFIDF(questions, pred_questions, answers, pred_answers,
                  word_sentence_dict)
    cnn = CNN(questions,
              pred_questions,
              answers,
              pred_answers,
              word_sentence_dict,
              isTrain=False)

    if output_file_name.split(".")[-1] == "txt":
        output = open(output_file_name, "w")
        for i in range(output_num):
            qs_index = int(random.random() * len(questions))
            qs_input = questions[qs_index].encode("utf-8")
            output.write("Question : %s\n" % qs_input)
            tfidf_response_id, tfidf_response = tfidf.ask_response(
                qs_input, top_k * 10)
            cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)

            for i in range(top_k):
                output.write("CNN response %d: %s\n" %
                             (i + 1, cnn_response[i].encode("utf-8")))
            output.write("\n")
        output.close()
        cnn_response_end_time = time.clock()
        print("CNN response cost %f" %
              (cnn_response_end_time - cnn_response_start_time))

    if output_file_name.split(".")[-1] == "csv":
        with open(
                output_file_name,
                'w',
        ) as csvfile:
            fieldnames = ['Question']
            fieldnames.extend(["Reply " + str(i + 1) for i in range(top_k)])
            fieldnames.append("Score")
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for i in range(output_num):
                dict = {"Score": ""}
                qs_index = int(random.random() * len(questions))
                qs_input = questions[qs_index].encode("utf-8")
                dict["Question"] = qs_input
                tfidf_response_id, tfidf_response = tfidf.ask_response(
                    qs_input, top_k * 10)
                cnn_response = cnn.ask_response(qs_input, top_k,
                                                tfidf_response_id)

                for i in range(min(top_k, len(cnn_response))):
                    dict["Reply " +
                         str(i + 1)] = cnn_response[i].encode("utf-8")
                writer.writerow(dict)
Example #34
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))
#labels_df starts at df[5000] so we're good on the matching of labels to content
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))


from TFIDF import TFIDF
#creates TFIDF matrix
TFIDF(corpus)

##############################################################################
###################KMEANS#####################################################
##############################################################################
from sklearn.externals import joblib
#Loads my pre-existing kmeans model
#Saves the model you just made
#joblib.dump(km, '700_No_Ngram.pkl')
km = joblib.load("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/KMeans_Cluster_Models/350_no_Ngram.pkl")
clusters = km.labels_.tolist()



#Only to create a new kmeans model
from sklearn.cluster import KMeans
Example #35
def main():

    command = sys.argv[1]
    no = int(sys.argv[2])
    if command == "CP":
        if no == 1:

            tensor = MovieTensor(1)
            tensor.getTensor()
        elif no == 2:
            tensor = MovieTensor(2)
            tensor.getTensor()
    elif command == "SVD":
        allactormoviesdata = pandas.read_csv("movie-actor.csv")

        alltagsdata = pandas.read_csv("mltags.csv")

        allactormoviesdata['max_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(max)
        allactormoviesdata['min_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(min)

        allactormoviesdata['actor_rank_weightage'] = allactormoviesdata.apply(
            compute_actor_weightage, axis=1)
        #
        # print(allactormoviesdata)

        min_timestamp = pandas.to_datetime(min(alltagsdata['timestamp']))
        max_timestamp = pandas.to_datetime(max(alltagsdata['timestamp']))

        alltagsdata['timestamp_weightage'] = alltagsdata.apply(
            CalculateTimestampWeights,
            axis=1,
            args=(min_timestamp, max_timestamp))

        mergeddata = allactormoviesdata[[
            'actorid', 'movieid', 'actor_rank_weightage'
        ]].merge(alltagsdata[['movieid', 'tagid', 'timestamp_weightage']],
                 on='movieid')

        #print(mergeddata[mergeddata['actorid'].isin([878356,1860883,316365,128645])])

        mergeddata['total_weightage'] = mergeddata.apply(
            aggregate_tf_weightages, axis=1)

        mergeddata['tag_weightage'] = mergeddata.groupby(
            ['actorid', 'tagid'])['total_weightage'].transform('sum')
        tfdata = mergeddata[['actorid', 'tagid', 'tag_weightage'
                             ]].drop_duplicates(subset=['tagid', 'actorid'])

        tfdata['total_weightage_actor'] = tfdata.groupby(
            ['actorid'])['tag_weightage'].transform('sum')

        tfdata['tf'] = tfdata.apply(ComputeTF, axis=1)

        taglist = tfdata['tagid'].tolist()
        alltagsdata = pandas.read_csv("mltags.csv")
        alltagsdata = alltagsdata[alltagsdata['tagid'].isin(taglist)]

        #print(alltagsdata)

        allactormoviesdata = pandas.read_csv("movie-actor.csv")
        requiredtagsdata = alltagsdata.merge(allactormoviesdata, on='movieid')

        requiredtagsdata.drop_duplicates(subset=['tagid', 'actorid'],
                                         inplace=True)
        requiredtagsdata['actor_count'] = requiredtagsdata.groupby(
            'tagid')['actorid'].transform('count')
        requiredtagsdata.drop_duplicates(subset=['tagid'], inplace=True)

        actordata = pandas.read_csv("imdb-actor-info.csv")
        total_actors = actordata.shape[0]

        requiredtagsdata['idf'] = requiredtagsdata.apply(
            ComputeIDF, axis=1, total_actors=total_actors)
        #
        # print(total_actors)
        # print(requiredtagsdata)

        tfidfdata = ProcessTFandIDFtoTFIDF(tfdata,
                                           requiredtagsdata[['tagid', 'idf']])

        # print(tfdata)

        #tfidfdata = tfidfdata[tfidfdata['actorid'].isin([878356,1860883,316365,128645])]

        #print(tfidfdata)

        actor_tag_matrix = tfidfdata.pivot_table(index='actorid',
                                                 columns='tagid',
                                                 values='tfidf',
                                                 fill_value=0)
        print "Actor Tag Matrix"
        print actor_tag_matrix

        tf = TFIDF("", 1, "_actor_")
        tf.calcMoviesVector()
Example #36
from ExtractAbstract import ExtractAbstract
from InformationContent import InformationContent

from TFIDF import TFIDF as TFIDF
from ClusterRelatedness import ClusterRelatedness
from DimensionRelatedness import DimensionRelatedness
from RelatednessGraph import RelatednessGraph

if __name__ == "__main__":
    '''
		PART 1 - 1
		Calculate IC and TFIDF
	'''
    IC = InformationContent("./source/ic.txt")
    #DEBUG: IC.printSortedList()
    TfIdf = TFIDF("./source/tfidf.txt")
    #DEBUG: TfIdf.printSortedList()
    '''
		PART 1 - 2
		Use IC and TFIDF to extract words from abstracts
	'''
    Extractor = ExtractAbstract("./source/corpus5.csv", IC, TfIdf, 0.35,
                                0.3)  # IC threshold / TFIDF threshold
    '''
		PART 2
		Calculate Relatedness
	'''
    # Finding Relatedness 1 - Find Vector Cluster
    # ClusterRelatedness = ClusterRelatedness("./source/vectors.txt", "./abstracts/", Extractor.fileNum)

    # Finding Relatedness 2 - Compare Word Pairs
Example #37
def get_Dnn_model(total_pl):

    NUM_PL = 8
    D_WORD = 300
    tf_idf = TFIDF(total_pl)
    pl_cnt, words = tf_idf.get_tfidf()

    x, y = pl_preprocessing(total_pl, NUM_PL)
    x = np.array(x)
    y = np.array(y)

    ###### testing on a held-out split of the training data #######
    FOLDS = 5
    X_train, X_test1, Y_train, y_test1 = train_test_split(x, y, test_size=0.2)
    data = CrossValidationFolds(X_train, Y_train, FOLDS)
    (X_train1, y_train1), (X_valid1, y_valid1) = data.split()

    ###### testing on data different from the training data #######
    # data = CrossValidationFolds(x, y, FOLDS)
    # (X_train1, y_train1), (X_valid1, y_valid1) = data.split()

    # X_test1,y_test1 = load_pl('../new_Steeve_data/filter_Dice/can/')
    # X_test1 = np.array(X_test1)
    # y_test1 = np.array(y_test1)

    ##### testing data ######
    # Tx = X_test1[0]
    # Ty = y_test1[0]
    # Tx = Tx.reshape([1,-1])
    # print(Tx.shape)
    # print(X_test1.shape)

    ### preliminary setup
    in_units = D_WORD * NUM_PL
    n_class = 6  # the task only asks to distinguish classes 0, 1, 2, 3 and 4 (5 classes in total)

    n_train = len(X_train1)  # number of training examples
    batch_size = 50
    n_batch = n_train // batch_size

    X = tf.placeholder(tf.float32, [None, in_units],
                       name="X")  # placeholder for the input vectors, shape [None, in_units]
    y = tf.placeholder(tf.int64, shape=(None), name="y")  # placeholder for the labels, shape [None]

    logits = L_layers_model(X, 128, n_class, 0.5)
    Y_proba = tf.nn.softmax(logits, name="Y_proba")
    loss, train_op = train_op(y, logits)
    accuracy, precision, recall = acc_model(y, logits)

    prediction = tf.argmax(Y_proba, 1)

    saver = tf.train.Saver()  # call save function
    config = tf.ConfigProto(device_count={'GPU': 1})  # run on the GPU

    # Params for Train
    epochs = 1000  # 10 for augmented training data, 20 for training data
    val_step = 100  # evaluate accuracy on the validation data every val_step steps

    # Training cycle
    max_acc = 0.  # Save the maximum accuracy value for validation data
    early_stop_limit = 0  # early-stopping counter

    init = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()

    with tf.Session(config=config) as sess:
        run(sess, X_train1, y_train1, X_valid1, y_valid1)
        sess.run(init_l)
        saver.restore(sess, '../dnn_model.ckpt')  # restore the early-stopped model

        print('Acc_test :',
              sess.run(accuracy, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
        print('Prec_value :',
              sess.run(precision, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
        print('Recall_value :',
              sess.run(recall, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
Example #38
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))
#labels_df starts at df[5000] so we're good on the matching of labels to content
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))

import os
os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Pre-Processing")
from TFIDF import TFIDF
tfidf_matrix = TFIDF(corpus)



####################################################################
##########################HAC#######################################
####################################################################
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
hac = AgglomerativeClustering(n_clusters=500, affinity="euclidean")
dense_matrix = tfidf_matrix.todense()
hac.fit_predict(dense_matrix)

from sklearn.externals import joblib
#Saves the model you just made
joblib.dump(hac, '350_euc_HAC.pkl')
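The project's TFIDF(corpus) helper used above is not shown; as a point of reference, here is a minimal sketch of the same vectorize-then-cluster pipeline with scikit-learn's built-in TfidfVectorizer standing in for it (the tiny corpus and the cluster count are illustrative only).

from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "storm hits the coast",
    "hurricane makes landfall on the coast",
    "stock market rallies today",
    "shares rise as the stock market rallies",
]

# scikit-learn's vectorizer stands in for the project's TFIDF helper
tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(corpus)
hac = AgglomerativeClustering(n_clusters=2)
labels = hac.fit_predict(tfidf_matrix.toarray())  # HAC needs a dense array
print(labels)  # the two weather stories and the two market stories land in separate clusters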