def analysis(self):
    query = request.params['query']
    if query.strip() == "":
        return render('/index.mako')

    conn = solr.SolrConnection('http://localhost:8983/solr')

    # Grab polarized data from Solr
    params = {'q': sanitize(query) + " sentiment:[0.0 TO 1.0]", 'rows': 10}
    good_results = conn.query(**params)

    params = {'q': sanitize(query) + " sentiment:[-1.0 TO 0.0]", 'rows': 10}
    bad_results = conn.query(**params)

    # Grab data from Solr
    params = {'q': sanitize(query), 'rows': 500, 'start': 0}
    results = conn.query(**params)
    conn.close()

    # Do ngrams analysis
    goodText = ''.join(hit['content'] for hit in good_results.results)
    badText = ''.join(hit['content'] for hit in bad_results.results)
    c.goodTerms, c.badTerms = ngrams.main(goodText, badText,
                                          ngrams.getWordsForDisplay)

    c.goodCount = good_results.numFound
    c.badCount = bad_results.numFound
    c.goodResults = good_results
    c.badResults = bad_results
    c.query = query

    if len(results) == 0:
        return render('/noresults.mako')

    c.results = results
    c.service = "analysis"
    return render('/analysis.mako')
def getBlogPost(self, filter):
    conn = solr.SolrConnection('http://localhost:8983/solr')
    print filter

    parsedFilter = SearchFilter()
    parsedFilter.decode(filter)

    params = parsedFilter.toSolr()
    params['q'] = params['q'] + " sentiment:[0.75 TO 1.0]"
    good_response = conn.query(**params)

    params = parsedFilter.toSolr()
    params['q'] = params['q'] + " sentiment:[0.0 TO 0.25]"
    bad_response = conn.query(**params)
    conn.close()

    goodText = ""
    badText = ""
    results = good_response.results + bad_response.results
    for hit in results:
        # sentiment is a float score; bucket hits with the same thresholds
        # used in the Solr range queries above
        if hit['sentiment'] >= 0.75:
            goodText += hit['content']
        elif hit['sentiment'] <= 0.25:
            badText += hit['content']

    q, p = ngrams.main(goodText, badText, ngrams.getWordsForDisplay)
    return (q, p,
            [(hit['title'], hit['content'], hit['sentiment'])
             for hit in results if hit['content']])
def polarize(self):
    c.service = "polarize"
    query = request.params['query']
    if query.strip() == "":
        return render('/index.mako')

    conn = solr.SolrConnection('http://localhost:8983/solr')

    # Grab data from Solr
    params = {'q': sanitize(query) + " sentiment:[0.0 TO 1.0]", 'rows': 10}
    good_results = conn.query(**params)

    params = {'q': sanitize(query) + " sentiment:[-1.0 TO 0.0]", 'rows': 10}
    bad_results = conn.query(**params)
    conn.close()

    if not (good_results or bad_results):
        c.query = query
        return render('/noresults.mako')

    # Do ngrams analysis
    goodText = ''.join(hit['content'] for hit in good_results.results)
    badText = ''.join(hit['content'] for hit in bad_results.results)
    q, p = ngrams.main(goodText, badText, ngrams.getWordsForDisplay)

    # Send params to context
    c.goodTerms = q  # [i[0] for i in q]
    c.badTerms = p   # [i[0] for i in p]
    c.goodResults = good_results
    c.badResults = bad_results
    c.query = query
    return render('/polarize.mako')
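# The three controllers above share one pattern: sanitize the user query,
# append a Solr "sentiment" range clause, and read hits off response.results.
# A minimal standalone sketch of that pattern, assuming the same solrpy API
# and local index as above; the field name, helper name, and example query
# are placeholders, not part of the original code.
import solr

def fetch_polarized(query, low, high, rows=10):
    conn = solr.SolrConnection('http://localhost:8983/solr')
    try:
        response = conn.query(q='%s sentiment:[%s TO %s]' % (query, low, high),
                              rows=rows)
        # each hit is a dict-like record; 'content' may be absent
        return [hit.get('content', '') for hit in response.results]
    finally:
        conn.close()

# e.g. positive_docs = fetch_polarized('coffee', 0.0, 1.0)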
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):

    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        lines = f.readlines()

    for line in lines:
        if not line or line.startswith("#"):
            continue

        line = line.strip().casefold()
        phones, *rest = line.split()

        try:
            freq = int(rest[0])
        except (ValueError, IndexError):
            freq = 1

        phones = "#{}#".format(phones)  # add word boundaries
        lenPhones = len(phones)

        for i in range(lenPhones - 2):
            phone1 = phones[i]
            phone2 = phones[i + 1]
            phone3 = phones[i + 2]

            phoneDict[phone3] += freq

            if i == 0:
                phoneDict[phone1] += freq
                phoneDict[phone2] += freq
                biphone = phone1 + sep + phone2
                biphoneDict[biphone] += freq

            biphone = phone2 + sep + phone3
            triphone = phone1 + sep + phone2 + sep + phone3

            triphoneDict[triphone] += freq
            biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1], reverse=True)
    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1], reverse=True)
    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1], reverse=True)

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    with outfilenamePhones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(phonesSorted)), file=f)
        print("# token count: {}".format(str(sum(phoneDict.values()))), file=f)
        for (phone, freq) in phonesSorted:
            print(phone + sep + str(freq), file=f)

    with outfilenameBiphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(biphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(biphoneDict.values()))), file=f)
        for (biphone, freq) in biphonesSorted:
            print(biphone + sep + str(freq), file=f)

    with outfilenameTriphones.open('w') as f:
        print(intro_string, file=f)
        print("# type count: {}".format(len(triphonesSorted)), file=f)
        print("# token count: {}".format(str(sum(triphoneDict.values()))), file=f)
        for (triphone, freq) in triphonesSorted:
            print(triphone + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    outfilenamePhones_json = changeFilenameSuffix(outfilenamePhones, '.json')
    with outfilenamePhones_json.open('w') as f:
        json_pdump(phoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameBiphones_json = changeFilenameSuffix(outfilenameBiphones, '.json')
    with outfilenameBiphones_json.open('w') as f:
        json_pdump(biphoneDict, f, key=lambda x: x[1], reverse=True)

    outfilenameTriphones_json = changeFilenameSuffix(outfilenameTriphones, '.json')
    with outfilenameTriphones_json.open('w') as f:
        json_pdump(triphoneDict, f, key=lambda x: x[1], reverse=True)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
                outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
                outfilenamePhones_json, outfilenameBiphones_json,
                outfilenameTriphones_json)
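# For reference, a hypothetical driver call for the phon.py entry point above;
# the language, corpus name, and data folder are placeholders, and the module
# is assumed to be importable as "phon".
import phon

phon.main(language="english", corpus="english-brown.txt",
          datafolder="../../data", maxwordtokens=0, use_corpus=True)
# If the expected wordlist file is missing, main() falls back to ngrams.main()
# to build it before counting phones, biphones, and triphones.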
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtypes=1000, nNeighbors=9, nEigenvectors=11,
         create_WordToContexts=False, create_ContextToWords=False,
         mincontexts=3, usesigtransforms=True):

    print("\n*****************************************************\n"
          "Running the manifold.py program now...\n")

    if filename:
        corpusStem = Path(filename).stem
        infolder = Path(Path(filename).parent, 'ngrams')
        outfolder = Path(Path(filename).parent, 'neighbors')
        outcontextsfolder = Path(Path(filename).parent, 'word_contexts')
    else:
        corpusStem = Path(corpus).stem
        infolder = Path(datafolder, language, 'ngrams')
        outfolder = Path(datafolder, language, 'neighbors')
        outcontextsfolder = Path(datafolder, language, 'word_contexts')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outcontextsfolder.exists():
        outcontextsfolder.mkdir(parents=True)

    infileWordsname = Path(infolder, corpusStem + '_words.txt')
    infileBigramsname = Path(infolder, corpusStem + '_bigrams.txt')
    infileTrigramsname = Path(infolder, corpusStem + '_trigrams.txt')

    if (not infileWordsname.exists()) or \
       (not infileBigramsname.exists()) or \
       (not infileTrigramsname.exists()):
        print("Error in locating n-gram data files.\n"
              "The program now creates them.\n")
        ngrams.main(language=language, corpus=corpus,
                    datafolder=datafolder, filename=filename)

    if usesigtransforms:
        if filename:
            infolderlxa = Path(Path(filename).parent, 'lxa')
        else:
            infolderlxa = Path(datafolder, language, 'lxa')
        sigtransform_json_fname = Path(infolderlxa,
                                       corpusStem + "_WordToSigtransforms.json")
        try:
            WordToSigtransforms = json_pload(sigtransform_json_fname.open())
        except FileNotFoundError:
            print("The file \"{}\" is not found.\n"
                  "The program now creates it.\n".format(sigtransform_json_fname))
            lxa5.main(language=language, corpus=corpus,
                      datafolder=datafolder, filename=filename)
            WordToSigtransforms = json_pload(sigtransform_json_fname.open())
        # WordToSigtransforms just read into the program; to be used soon...

    print('Reading word list...', flush=True)
    mywords = GetMyWords(infileWordsname, corpus)

    print("Word file is", infileWordsname, flush=True)
    print("Number of neighbors to find for each word type: ", nNeighbors)
    print('Corpus has', len(mywords), 'word types', flush=True)

    lenMywords = len(mywords)
    if lenMywords > maxwordtypes:
        nWordsForAnalysis = maxwordtypes
    else:
        nWordsForAnalysis = lenMywords
    print('number of words for analysis adjusted to', nWordsForAnalysis)

    analyzedwordlist = list(mywords.keys())[:nWordsForAnalysis]
    worddict = {w: analyzedwordlist.index(w) for w in analyzedwordlist}

    corpusName = corpusStem + '_' + str(nWordsForAnalysis) + '_' + str(nNeighbors)

    outfilenameNeighbors = Path(outfolder, corpusName + "_neighbors.txt")
    outfilenameSharedcontexts = Path(outfolder,
                                     corpusName + "_shared_contexts.txt")
    outfilenameNeighborGraph = Path(outfolder, corpusName + "_neighbors.gexf")
    outfilenameImportantContextToWords = Path(
        outfolder, corpusName + "_ImportantContextToWords.txt")
    outWordToContexts_json = Path(outcontextsfolder,
                                  corpusName + "_WordToContexts.json")
    outContextToWords_json = Path(outcontextsfolder,
                                  corpusName + "_ContextToWords.json")

    print("Reading bigrams/trigrams and computing context array...",
          flush=True)

    context_array, contextdict, \
        WordToContexts, ContextToWords = GetContextArray(
            nWordsForAnalysis, worddict,
            infileBigramsname, infileTrigramsname, mincontexts)

    print("Computing shared context master matrix...", flush=True)
    CountOfSharedContexts = context_array.dot(context_array.T).todense()
    del context_array

    print("Computing diameter...", flush=True)
    Diameter = Normalize(nWordsForAnalysis, CountOfSharedContexts)

    print("Computing incidence graph...", flush=True)
    incidencegraph = compute_incidence_graph(nWordsForAnalysis, Diameter,
                                             CountOfSharedContexts)
    del CountOfSharedContexts

    print("Computing mylaplacian...", flush=True)
    mylaplacian = compute_laplacian(nWordsForAnalysis, Diameter,
                                    incidencegraph)
    del Diameter
    del incidencegraph

    print("Computing eigenvectors...", flush=True)
    myeigenvalues, myeigenvectors = GetEigenvectors(mylaplacian)
    del mylaplacian
    del myeigenvalues

    print('Computing distances between words...', flush=True)
    # take first N columns of eigenvector matrix
    coordinates = myeigenvectors[:, :nEigenvectors]
    wordsdistance = compute_words_distance(nWordsForAnalysis, coordinates)
    del coordinates

    print('Computing nearest neighbors now...', flush=True)
    closestNeighbors = compute_closest_neighbors(wordsdistance, nNeighbors)

    WordToNeighbors_by_str = OrderedDict()
    WordToNeighbors = dict()

    for wordno in range(nWordsForAnalysis):
        line = closestNeighbors[wordno]
        word_idx, neighbors_idx = line[0], line[1:]
        word = analyzedwordlist[word_idx]
        neighbors = [analyzedwordlist[idx] for idx in neighbors_idx]
        WordToNeighbors_by_str[word] = neighbors
        WordToNeighbors[word_idx] = neighbors_idx

    del closestNeighbors

    with outfilenameNeighbors.open('w') as f:
        print("# language: {}\n# corpus: {}\n"
              "# Number of word types analyzed: {}\n"
              "# Number of neighbors: {}\n".format(language, corpus,
                                                   nWordsForAnalysis,
                                                   nNeighbors), file=f)
        for word, neighbors in WordToNeighbors_by_str.items():
            print(word, " ".join(neighbors), file=f)

    neighbor_graph = GetMyGraph(WordToNeighbors_by_str)

    # output manifold as gexf data file
    nx.write_gexf(neighbor_graph, str(outfilenameNeighborGraph))

    # output manifold as json for d3 visualization
    manifold_json_data = json_graph.node_link_data(neighbor_graph)
    outfilenameManifoldJson = Path(outfolder, corpusName + "_manifold.json")
    json.dump(manifold_json_data, outfilenameManifoldJson.open("w"), indent=2)

    WordToNeighbors_json = changeFilenameSuffix(outfilenameNeighbors, ".json")
    json_pdump(WordToNeighbors_by_str, WordToNeighbors_json.open("w"),
               asis=True)

    print("Computing shared contexts among neighbors...", flush=True)
    WordToSharedContextsOfNeighbors, \
        ImportantContextToWords = compute_WordToSharedContextsOfNeighbors(
            nWordsForAnalysis, WordToContexts, WordToNeighbors, ContextToWords,
            nNeighbors, mincontexts)

    output_WordToSharedContextsOfNeighbors(outfilenameSharedcontexts,
                                           WordToSharedContextsOfNeighbors,
                                           worddict, contextdict,
                                           nWordsForAnalysis)

    output_ImportantContextToWords(outfilenameImportantContextToWords,
                                   ImportantContextToWords,
                                   contextdict, worddict)

    outputfilelist = [outfilenameNeighbors, outfilenameNeighborGraph,
                      WordToNeighbors_json, outfilenameSharedcontexts,
                      outfilenameImportantContextToWords,
                      outfilenameManifoldJson]

    if create_WordToContexts:
        outputfilelist.append(outWordToContexts_json)
        json_pdump(WordToContexts, outWordToContexts_json.open("w"),
                   key=lambda x: len(x[1]), reverse=True)

    if create_ContextToWords:
        outputfilelist.append(outContextToWords_json)
        json_pdump(ContextToWords, outContextToWords_json.open("w"),
                   key=lambda x: len(x[1]), reverse=True)

    stdout_list("Output files:", *outputfilelist)
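# The neighbor computation above is a spectral embedding: the shared-context
# counts define a weighted graph, its Laplacian is eigendecomposed, the first
# nEigenvectors columns give each word a coordinate, and neighbors are read
# off pairwise distances. A toy numpy/scipy illustration of that idea only;
# it is not the project's Normalize/compute_laplacian helpers, whose exact
# normalization differs.
import numpy as np
from scipy.spatial.distance import pdist, squareform

shared = np.array([[0, 4, 1],
                   [4, 0, 3],
                   [1, 3, 0]], dtype=float)   # toy shared-context counts
degree = np.diag(shared.sum(axis=1))
laplacian = degree - shared                   # unnormalized graph Laplacian

eigenvalues, eigenvectors = np.linalg.eigh(laplacian)
coordinates = eigenvectors[:, :2]             # first k eigenvector columns
distances = squareform(pdist(coordinates))    # pairwise word distances
nearest = distances.argsort(axis=1)[:, 1]     # each word's closest neighbor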
def main(language=None, corpus=None, datafolder=None, filename=None,
         MinimumStemLength=4, MinimumAffixLength=1, SF_threshold=3,
         maxwordtokens=0, use_corpus=True):

    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------#
    # read wordlist
    #--------------------------------------------------------------------#

    print("reading wordlist...", flush=True)

    wordlist_path, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------#
    # output settings
    #--------------------------------------------------------------------#

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")
    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")
    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------#
    # Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------#

    print("finding breaks in words...", flush=True)

    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------#
    # Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------#

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------#
    # Compute successors and predecessors
    #--------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)

    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    json_pdump(successors, outfile_SF_name_json.open("w"))

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    json_pdump(predecessors, outfile_PF_name_json.open("w"))

    print("printing signatures...", flush=True)

    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------#
    # Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------#

    print("printing tries...", flush=True)

    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name, reversedwordlist, WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    json_pdump(WordsBrokenLtoR, outfile_trieLtoR_name_json.open("w"))

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    json_pdump(WordsBrokenRtoL, outfile_trieRtoL_name_json.open("w"))

    stdout_list("Output files:",
                outfile_SF_name, outfile_PF_name,
                outfile_trieLtoR_name, outfile_trieRtoL_name,
                outfile_Signatures_name,
                outfile_SF_name_json, outfile_PF_name_json,
                outfile_trieLtoR_name_json, outfile_trieRtoL_name_json)
    for hit in good_results:
        try:
            goodText += hit['content']
        except Exception, e:
            print 'good_results error', e

    for hit in bad_results:
        try:
            badText += hit['content']
        except Exception, e:
            print 'bad_results error', e

    #goodText = ''.join(hit['content'] for hit in good_results)
    #badText = ''.join(hit['content'] for hit in bad_results)

    c.goodTerms, c.badTerms = ngrams.main(goodText, badText,
                                          ngrams.getWordsForDisplay)
    c.goodResults = good_results
    c.neutralResults = neutral_results
    c.badResults = bad_results
    return render('/analysis.mako')

def custom(self):
    c.service = "search"
    c.query = request.params['query']
    if c.query.strip() == "":
        return render('/index.mako')
    self.startAsyncSearch(c.query)
def main(language=None, corpus=None, datafolder=None, filename=None,
         MinimumStemLength=4, MaximumAffixLength=3, MinimumNumberofSigUses=5,
         maxwordtokens=0, use_corpus=True):

    print("\n*****************************************************\n"
          "Running the lxa5.py program now...\n")

    # -------------------------------------------------------------------------#
    # decide suffixing or prefixing
    # -------------------------------------------------------------------------#

    suffix_languages = {"english", "french", "hungarian", "turkish",
                        "russian", "german", "spanish", "test"}
    prefix_languages = {"swahili"}

    if str(language).casefold() in prefix_languages:
        FindSuffixesFlag = False  # prefixal
    else:
        FindSuffixesFlag = True  # suffixal

    wordlist_path, corpus_stem = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())

    if filename:
        outfolder = Path(Path(filename).parent, "lxa")
    else:
        outfolder = Path(datafolder, language, 'lxa')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # TODO -- filenames not yet used in main()
    outfile_Signatures_name = str(outfolder) + corpus_stem + "_Signatures.txt"
    outfile_SigTransforms_name = str(outfolder) + corpus_stem + "_SigTransforms.txt"
    outfile_FSA_name = str(outfolder) + corpus_stem + "_FSA.txt"
    outfile_FSA_graphics_name = str(outfolder) + corpus_stem + "_FSA_graphics.png"

    # -------------------------------------------------------------------------#
    # create: BisigToTuple
    #             (key: tuple of bisig | value: set of (stem, word1, word2))
    #         StemToWords (key: stem | value: set of words)
    #         SigToStems  (key: tuple of sig | value: set of stems)
    #         StemToSig   (key: str of stem | value: tuple of sig)
    #         WordToSigs  (key: str of word | value: set of sigs)
    #         AffixToSigs (key: str of affix | value: set of sigs)
    # -------------------------------------------------------------------------#

    BisigToTuple = MakeBiSignatures(wordlist, MinimumStemLength,
                                    MaximumAffixLength, FindSuffixesFlag)
    print("BisigToTuple ready", flush=True)

    StemToWords = MakeStemToWords(BisigToTuple, MinimumNumberofSigUses)
    print("StemToWords ready", flush=True)

    SigToStems = MakeSigToStems(StemToWords, MaximumAffixLength,
                                MinimumNumberofSigUses, FindSuffixesFlag)
    print("SigToStems ready", flush=True)

    StemToSig = MakeStemToSig(SigToStems)
    print("StemToSig ready", flush=True)

    WordToSigs = MakeWordToSigs(StemToWords, StemToSig)
    print("WordToSigs ready", flush=True)

    WordToSigtransforms = MakeWordToSigtransforms(WordToSigs)
    print("WordToSigtransforms ready", flush=True)

    AffixToSigs = MakeAffixToSigs(SigToStems)
    print("AffixToSigs ready", flush=True)

    # -------------------------------------------------------------------------#
    # generate graphs for several dicts
    # -------------------------------------------------------------------------#
    # GenerateGraphFromDict(StemToWords, outfolder, 'StemToWords.gexf')
    # GenerateGraphFromDict(SigToStems, outfolder, 'SigToStems.gexf')
    # GenerateGraphFromDict(WordToSigs, outfolder, 'WordToSigs.gexf')
    # GenerateGraphFromDict(StemToSig, outfolder, 'StemToSig.gexf')
    # -------------------------------------------------------------------------#

    # -------------------------------------------------------------------------#
    # output stem file
    # -------------------------------------------------------------------------#

    stemfilename = Path(outfolder, '{}_StemToWords.txt'.format(corpus_stem))
    OutputLargeDict(stemfilename, StemToWords,
                    key=lambda x: len(x[1]), reverse=True,
                    min_cell_width=25, howmanyperline=5)

    print('===> stem file generated:', stemfilename, flush=True)

    # -------------------------------------------------------------------------#
    # output affix file
    # -------------------------------------------------------------------------#

    affixfilename = Path(outfolder, '{}_AffixToSigs.txt'.format(corpus_stem))
    OutputLargeDict(affixfilename, AffixToSigs, min_cell_width=25,
                    key=lambda x: len(x[1]), reverse=True,
                    howmanyperline=5, SignatureValues=True)
    print('===> affix file generated:', affixfilename, flush=True)

    # -------------------------------------------------------------------------#
    # output SigToStems
    # -------------------------------------------------------------------------#

    SigToStems_outfilename = Path(outfolder, corpus_stem + "_SigToStems.txt")
    OutputLargeDict(SigToStems_outfilename, SigToStems,
                    key=lambda x: len(x[1]), reverse=True,
                    howmanyperline=5, SignatureKeys=True)

    SigToStems_outfilename_json = changeFilenameSuffix(SigToStems_outfilename,
                                                       ".json")
    json_pdump(SigToStems, SigToStems_outfilename_json.open("w"),
               key=lambda x: len(x[1]), reverse=True)

    print('===> output file generated:', SigToStems_outfilename, flush=True)
    print('===> output file generated:', SigToStems_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    # output WordToSigs
    # -------------------------------------------------------------------------#

    WordToSigs_outfilename = Path(outfolder, corpus_stem + "_WordToSigs.txt")
    OutputLargeDict(WordToSigs_outfilename, WordToSigs,
                    key=lambda x: len(x[1]), reverse=True,
                    min_cell_width=25, SignatureValues=True)

    WordToSigs_outfilename_json = changeFilenameSuffix(WordToSigs_outfilename,
                                                       ".json")
    json_pdump(WordToSigs, WordToSigs_outfilename_json.open("w"),
               key=lambda x: len(x[1]), reverse=True)

    print('===> output file generated:', WordToSigs_outfilename, flush=True)
    print('===> output file generated:', WordToSigs_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    # output WordToSigtransforms
    # -------------------------------------------------------------------------#

    WordToSigtransforms_outfilename = Path(
        outfolder, corpus_stem + "_WordToSigtransforms.txt")
    OutputLargeDict(WordToSigtransforms_outfilename, WordToSigtransforms,
                    min_cell_width=25, sigtransforms=True,
                    key=lambda x: len(x[1]), reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename, flush=True)

    WordToSigtransforms_outfilename_json = changeFilenameSuffix(
        WordToSigtransforms_outfilename, ".json")
    json_pdump(WordToSigtransforms,
               WordToSigtransforms_outfilename_json.open("w"),
               key=lambda x: len(x[1]), reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename_json, flush=True)

    # -------------------------------------------------------------------------#
    # output the most freq word types not in any induced paradigms {the, of..}
    # -------------------------------------------------------------------------#

    wordFreqDict_sorted = sorted_alphabetized(wordFreqDict.items(),
                                              key=lambda x: x[1], reverse=True)

    mostFreqWordsNotInSigs_outfilename = Path(
        outfolder, corpus_stem + "_mostFreqWordsNotInSigs.txt")

    with mostFreqWordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)
            else:
                break

    print('===> output file generated:',
          mostFreqWordsNotInSigs_outfilename, flush=True)

    # -------------------------------------------------------------------------#
    # output the word types in induced paradigms
    # -------------------------------------------------------------------------#

    WordsInSigs_outfilename = Path(outfolder, corpus_stem + "_WordsInSigs.txt")

    with WordsInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:', WordsInSigs_outfilename, flush=True)

    # -------------------------------------------------------------------------#
    # output the word types NOT in induced paradigms
    # -------------------------------------------------------------------------#

    WordsNotInSigs_outfilename = Path(outfolder,
                                      corpus_stem + "_WordsNotInSigs.txt")

    with WordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:', WordsNotInSigs_outfilename,
          flush=True)
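# The dictionaries built above encode morphological signatures: a signature is
# a tuple of affixes that a set of stems all take. Illustrative data shapes
# only, with invented English words under a suffixal analysis; this is not
# output of the real pipeline.
SigToStems_example = {
    ("NULL", "ed", "ing", "s"): {"jump", "walk", "talk"},
    ("NULL", "s"): {"book", "cat"},
}
StemToSig_example = {"jump": ("NULL", "ed", "ing", "s"),
                     "book": ("NULL", "s")}
WordToSigs_example = {"jumped": {("NULL", "ed", "ing", "s")}}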