def doStandardInitialize(remove=False):
    """Load all feeds, derive topics from a reduced subset and tag that subset.

    Steps, in order:
      1. Load the complete feed dictionary with ``loadAllFeedsFromFile``.
      2. Reduce it once with ``smallDict(allDict, 500)``; the same reduced
         dictionary is used both for topic derivation and for the update,
         so both steps are guaranteed to see the same entries.
      3. Build the document list from the subset using the custom stop words.
      4. Derive topics with ``maxNum=30`` and ``ngram_range=(3, 4)``.
      5. Call ``updateDictionaryByFuzzyRelevanceofTopics`` on the subset with
         ``limit=None`` and ``threshold=70``.

    Args:
        remove: Forwarded unchanged as the ``remove`` keyword of
            ``updateDictionaryByFuzzyRelevanceofTopics``.

    Returns:
        The complete dictionary returned by ``loadAllFeedsFromFile``.
    """
    allDict = loadAllFeedsFromFile()

    # Compute the subset a single time: previously smallDict was called twice,
    # once for getDocList and once for the update step.
    subset = smallDict(allDict, 500)

    docl = getDocList(subset, reloaddocs=False, stop_list=getCustomStopWords())
    # Alternative kept for reference: docl = preProcessDocs(docs)

    # Original author's note on these settings: produces recognisable topics
    # but with many repetitions in different constellations.
    topics = deriveTopicMaps(docl, maxNum=30, ngram_range=(3, 4))

    updateDictionaryByFuzzyRelevanceofTopics(
        topics, subset, limit=None, threshold=70, remove=remove
    )
    return allDict
def testCase():
    """Run the pipeline from the feed file through to the 3D similarity plot.

    What this function does, in order:
      * loads all feeds from file and reduces them with ``smallDict(..., 500)``
      * builds the document list from the reduced dictionary, using the
        custom stop words
      * derives topics with ``maxNum=20`` and ``ngram_range=(3, 4)``
      * calls ``updateDictionaryByFuzzyRelevanceofTopics`` on the reduced
        dictionary with ``threshold=60`` and ``remove=True``
      * computes a matrix with ``deriveSoftCosineSimilarityMatrix``
      * stores that matrix through ``saveDFPickle``
      * hands dictionary and matrix to ``do3DPlotOfCosineSimilarity``

    Notes carried over from the original description (not verifiable from
    this function alone -- confirm against the helpers): the update adds a
    topic list to each entry whose fuzzy relevance (fuzzywuzzy ``process``)
    exceeds the threshold; the similarity matrix is built on word embeddings
    (fasttext 300-dimensional or GloVe 50-dimensional); the plot step
    performs a spectral analysis / PCA dimension reduction and renders a 3D
    scatter plot with tool tips.

    Returns:
        None.
    """
    feeds = loadAllFeedsFromFile()
    reduced = smallDict(feeds, 500)

    stop_words = getCustomStopWords()
    documents = getDocList(
        reduced,
        limit=None,
        reloaddocs=False,
        stop_list=stop_words,
    )
    topic_maps = deriveTopicMaps(documents, maxNum=20, ngram_range=(3, 4))

    updateDictionaryByFuzzyRelevanceofTopics(
        topic_maps,
        reduced,
        limit=None,
        threshold=60,
        remove=True,
    )

    similarity = deriveSoftCosineSimilarityMatrix(reduced)
    saveDFPickle(similarity)
    do3DPlotOfCosineSimilarity(reduced, None, similarity)
def testDisplayTopics(numArticles=None, numTopics=30, dict=None):
    """Derive topics for a set of feed entries and display them.

    Args:
        numArticles: When truthy, the entries are reduced with
            ``smallDict(dict, numArticles)``; otherwise the whole dictionary
            is used.
        numTopics: Passed as ``maxNum`` to ``deriveTopicMaps``.
        dict: Feed dictionary to work on. When falsy (``None`` or empty) the
            feeds are loaded with ``loadAllFeedsFromFile``. The name shadows
            the builtin but is kept because callers pass it by keyword.

    Returns:
        None.
    """
    if not dict:
        dict = loadAllFeedsFromFile()

    # Only shrink the working set when an article count was requested.
    small = smallDict(dict, numArticles) if numArticles else dict

    stop_words = getCustomStopWords()
    # Variant without custom stop words: getDocList(small, reloaddocs=False)
    documents = getDocList(small, reloaddocs=False, stop_list=stop_words)

    topic_maps = deriveTopicMaps(
        documents, maxNum=numTopics, ngram_range=(3, 3)
    )
    updateDictionaryByFuzzyRelevanceofTopics(topic_maps, small, limit=30)
    displayTopics(topic_maps)
def testDisplayTags(numArticles=300, numTopics=30, dict=None):
    """Display tags for a reduced set of feed entries.

    Args:
        numArticles: Size argument passed to ``smallDict``.
        numTopics: Second positional argument passed to ``displayTags``.
        dict: Feed dictionary to work on. When falsy (``None`` or empty) the
            feeds are loaded with ``loadAllFeedsFromFile``. The name shadows
            the builtin but is kept because callers pass it by keyword.

    Returns:
        None.
    """
    feeds = dict if dict else loadAllFeedsFromFile()
    displayTags(smallDict(feeds, numArticles), numTopics)


#%%
# Scratch cell for interactive runs (intentionally left commented out):
# allDict1=loadAllFeedsFromFile()
# small=smallDict(allDict1,300)
# testDisplayTopics(dict=small)
# testDisplayAuthors(dict=small)
# testDisplayTags(dict=small)
def testJointPlot(allDict, size=100):
    """Run sentiment analysis and draw a joint plot for the first topic.

    The dictionary is reduced with ``smallDict(allDict, size)``, sentiment
    analysis is run on the reduced dictionary, topics are derived
    (``maxNum=30``, ``ngram_range=(3, 3)``) and applied to it with
    ``threshold=20`` and ``remove=True``. The first element of the first
    topic item is then plotted with ``jointPlotOfSentiment`` using the
    ``"Positive"`` argument.

    Args:
        allDict: Feed dictionary to sample from.
        size: Size argument passed to ``smallDict``.

    Returns:
        None.

    Raises:
        IndexError: If no topics were derived (indexing the first one fails).
    """
    reduced = smallDict(allDict, size)
    conductSentimentAnalysis(reduced)

    stop_words = getCustomStopWords()
    documents = getDocList(reduced, reloaddocs=False, stop_list=stop_words)
    topic_maps = deriveTopicMaps(documents, maxNum=30, ngram_range=(3, 3))

    updateDictionaryByFuzzyRelevanceofTopics(
        topic_maps,
        reduced,
        limit=None,
        threshold=20,
        remove=True,
    )

    # Collect the leading element of every topic item.
    topic_keys = []
    for entry in topic_maps:
        topic_keys.append(entry[0])

    # Alternative inspection step: getSentimentsForTopic(topic_keys[0], reduced)
    first_topic = topic_keys[0]
    jointPlotOfSentiment(first_topic, reduced, "Positive")
def testDisplayTopicsAndFeeds(numArticles=500, dict=None, numTopics=30, ngram_range=(3, 3)):
    """Derive topics from a reduced set of feeds and display topics with feeds.

    Topics are derived from ``smallDict(dict, numArticles)``, but the
    relevance update and the display both operate on the full ``dict``.

    Args:
        numArticles: Used as ``limitsize`` when the feeds have to be loaded,
            and as the size argument of ``smallDict``.
        dict: Feed dictionary to work on. When falsy (``None`` or empty) it
            is loaded with ``loadAllFeedsFromFile(limitsize=numArticles)``.
            The name shadows the builtin but is kept because callers pass it
            by keyword.
        numTopics: Passed as ``maxNum`` to ``deriveTopicMaps``.
        ngram_range: Passed through to ``deriveTopicMaps``.

    Returns:
        None.
    """
    if not dict:
        dict = loadAllFeedsFromFile(limitsize=numArticles)

    reduced = smallDict(dict, numArticles)
    stop_words = getCustomStopWords()
    documents = getDocList(reduced, reloaddocs=False, stop_list=stop_words)

    topic_maps = deriveTopicMaps(
        documents,
        maxNum=numTopics,
        ngram_range=ngram_range,
    )

    # Per the original note: populates the topic list in the dict entries.
    updateDictionaryByFuzzyRelevanceofTopics(topic_maps, dict)
    displayTopicsAndFeeds(dict)
def runSentiment(allDict, sm, numdocs=500):
    """Run sentiment analysis and show a 3D sentiment plot for one topic.

    After sentiment analysis and topic derivation (``maxNum=30``,
    ``ngram_range=(3, 3)``, update with ``threshold=20`` and
    ``remove=True``), the topic item at index 5 is selected, its sentiments
    are fetched with ``getSentimentsForTopic3`` and plotted with
    ``plotSentiment3D`` (``notebook=False``).

    Args:
        allDict: Full feed dictionary. When falsy it is loaded with
            ``loadAllFeedsFromFile``.
        sm: Reduced dictionary to analyse. When falsy it is created with
            ``smallDict(allDict, numdocs)``.
        numdocs: Size argument for ``smallDict``; only used when ``sm`` is
            falsy.

    Returns:
        None.

    Raises:
        IndexError: If fewer than six topics were derived.
    """
    if not allDict:
        allDict = loadAllFeedsFromFile()
    if not sm:
        sm = smallDict(allDict, numdocs)

    conductSentimentAnalysis(sm)

    stop_words = getCustomStopWords()
    documents = getDocList(sm, reloaddocs=False, stop_list=stop_words)
    topic_maps = deriveTopicMaps(documents, maxNum=30, ngram_range=(3, 3))
    updateDictionaryByFuzzyRelevanceofTopics(
        topic_maps,
        sm,
        limit=None,
        threshold=20,
        remove=True,
    )

    # The result is not used here; the call is kept in case it has side
    # effects -- TODO confirm against getTopicIdDict.
    getTopicIdDict(sm)

    # Leading element of every topic item; the sixth one is plotted.
    topic_keys = [entry[0] for entry in topic_maps]
    chosen = topic_keys[5]

    sentiments = getSentimentsForTopic3(chosen, sm)
    plotSentiment3D(sentiments, sm, notebook=False, topic=chosen)
def testDisplayAuthors(numArticles=300, dict=None):
    """Display authors for a reduced set of feed entries.

    Args:
        numArticles: Size argument passed to ``smallDict``.
        dict: Feed dictionary to work on. When falsy (``None`` or empty) the
            feeds are loaded with ``loadAllFeedsFromFile``. The name shadows
            the builtin but is kept because callers pass it by keyword.

    Returns:
        None.
    """
    feeds = dict if dict else loadAllFeedsFromFile()
    reduced = smallDict(feeds, numArticles)
    displayAuthors(dict=reduced)