Exemplo n.º 1
0
    def analysis(self):
        """Render the sentiment analysis page for the submitted query.

        Reads the ``query`` request parameter and runs three Solr searches:
        the query restricted to ``sentiment:[0.0 TO 1.0]``, the query
        restricted to ``sentiment:[-1.0 TO 0.0]`` (10 rows each) and the
        unrestricted query (500 rows).  The ``content`` of the polarized
        hits is passed to ``ngrams.main`` and the results are published on
        the template context ``c``.

        Returns:
            The rendered ``/index.mako`` when the query is blank,
            ``/noresults.mako`` when the unrestricted search is empty,
            and ``/analysis.mako`` otherwise.
        """
        query = request.params['query']

        if query.strip() == "":
            return render('/index.mako')

        # Sanitize once; all three searches share the same base query.
        base_query = sanitize(query)

        conn = solr.SolrConnection('http://localhost:8983/solr')
        try:
            # Grab polarized data from Solr
            good_results = conn.query(
                q=base_query + " sentiment:[0.0 TO 1.0]", rows=10)
            bad_results = conn.query(
                q=base_query + " sentiment:[-1.0 TO 0.0]", rows=10)

            # Grab data from Solr
            results = conn.query(q=base_query, rows=500, start=0)
        finally:
            # Close the connection even if one of the queries raises.
            conn.close()

        # Do ngrams analysis
        goodText = ''.join(hit['content'] for hit in good_results.results)
        badText = ''.join(hit['content'] for hit in bad_results.results)
        c.goodTerms, c.badTerms = ngrams.main(goodText, badText,
                                              ngrams.getWordsForDisplay)

        c.goodCount = good_results.numFound
        c.badCount = bad_results.numFound

        c.goodResults = good_results
        c.badResults = bad_results

        c.query = query

        if len(results) == 0:
            return render('/noresults.mako')

        c.results = results
        c.service = "analysis"
        return render('/analysis.mako')
Exemplo n.º 2
0
 def getBlogPost(self, filter):
     """Fetch polarized posts matching *filter* and compute their n-grams.

     The filter is decoded with ``SearchFilter`` and converted to Solr
     parameters twice: once with ``sentiment:[0.75 TO 1.0]`` appended to
     the query and once with ``sentiment:[0.0 TO 0.25]``.

     Args:
         filter: encoded filter accepted by ``SearchFilter.decode``.

     Returns:
         A tuple ``(q, p, posts)`` where ``q`` and ``p`` are the two values
         returned by ``ngrams.main`` and ``posts`` is a list of
         ``(title, content, sentiment)`` tuples for every hit whose
         content is truthy.
     """
     # Parenthesized form prints the same on Python 2 and parses on Python 3.
     print(filter)
     parsedFilter = SearchFilter()
     parsedFilter.decode(filter)

     conn = solr.SolrConnection('http://localhost:8983/solr')
     try:
         params = parsedFilter.toSolr()
         params['q'] = params['q'] + " sentiment:[0.75 TO 1.0]"
         good_response = conn.query(**params)
         params = parsedFilter.toSolr()
         params['q'] = params['q'] + " sentiment:[0.0 TO 0.25]"
         bad_response = conn.query(**params)
     finally:
         # Close the connection even if one of the queries raises.
         conn.close()

     results = good_response.results + bad_response.results
     # NOTE(review): the searches use numeric sentiment ranges, yet hits are
     # classified by equality with True/False -- confirm the field's type.
     goodText = ''.join(hit['content'] for hit in results
                        if hit['sentiment'] == True)
     badText = ''.join(hit['content'] for hit in results
                       if hit['sentiment'] == False)
     q, p = ngrams.main(goodText, badText, ngrams.getWordsForDisplay)
     posts = [(hit['title'], hit['content'], hit['sentiment'])
              for hit in results if hit['content']]
     return (q, p, posts)
Exemplo n.º 3
0
    def polarize(self):
        """Render the polarized (good vs. bad) view for the submitted query.

        Reads the ``query`` request parameter and runs two Solr searches of
        10 rows each: the query restricted to ``sentiment:[0.0 TO 1.0]`` and
        to ``sentiment:[-1.0 TO 0.0]``.  The ``content`` of the hits is
        passed to ``ngrams.main`` and the results are published on the
        template context ``c``.

        Returns:
            The rendered ``/index.mako`` when the query is blank,
            ``/noresults.mako`` when both responses are falsy, and
            ``/polarize.mako`` otherwise.
        """
        c.service = "polarize"

        query = request.params['query']

        if query.strip() == "":
            return render('/index.mako')

        # Sanitize once; both searches share the same base query.
        base_query = sanitize(query)

        conn = solr.SolrConnection('http://localhost:8983/solr')
        try:
            # Grab data from Solr
            good_results = conn.query(
                q=base_query + " sentiment:[0.0 TO 1.0]", rows=10)
            bad_results = conn.query(
                q=base_query + " sentiment:[-1.0 TO 0.0]", rows=10)
        finally:
            # Close the connection even if one of the queries raises.
            conn.close()

        if not (good_results or bad_results):
            c.query = query
            return render('/noresults.mako')

        # Do ngrams analysis
        goodText = ''.join(hit['content'] for hit in good_results.results)
        badText = ''.join(hit['content'] for hit in bad_results.results)
        q, p = ngrams.main(goodText, badText, ngrams.getWordsForDisplay)

        # Send params to context
        c.goodTerms = q
        c.badTerms = p

        c.goodResults = good_results
        c.badResults = bad_results

        c.query = query

        return render('/polarize.mako')
Exemplo n.º 4
0
    def analysis(self):
        """Build and render the analysis page for the ``query`` parameter.

        Three Solr searches are issued over one connection: positive
        sentiment (``sentiment:[0.0 TO 1.0]``, 10 rows), negative sentiment
        (``sentiment:[-1.0 TO 0.0]``, 10 rows) and the plain query
        (500 rows from offset 0).  The concatenated ``content`` of the
        polarized hits goes through ``ngrams.main``; counts, results and
        terms are stored on the template context ``c``.

        Returns:
            ``/index.mako`` rendered for a blank query, ``/noresults.mako``
            when the plain search has length 0, else ``/analysis.mako``.
        """
        query = request.params['query']

        if query.strip() == "":
            return render('/index.mako')

        # One sanitize call serves every search below.
        safe_query = sanitize(query)

        conn = solr.SolrConnection('http://localhost:8983/solr')
        try:
            # Grab polarized data from Solr
            good_results = conn.query(
                q=safe_query + " sentiment:[0.0 TO 1.0]", rows=10)
            bad_results = conn.query(
                q=safe_query + " sentiment:[-1.0 TO 0.0]", rows=10)

            # Grab data from Solr
            results = conn.query(q=safe_query, rows=500, start=0)
        finally:
            # Guarantees the connection is released on query failure too.
            conn.close()

        # Do ngrams analysis
        goodText = ''.join(hit['content'] for hit in good_results.results)
        badText = ''.join(hit['content'] for hit in bad_results.results)
        c.goodTerms, c.badTerms = ngrams.main(goodText, badText,
                                              ngrams.getWordsForDisplay)

        c.goodCount = good_results.numFound
        c.badCount = bad_results.numFound

        c.goodResults = good_results
        c.badResults = bad_results

        c.query = query

        if len(results) == 0:
            return render('/noresults.mako')

        c.results = results
        c.service = "analysis"
        return render('/analysis.mako')
Exemplo n.º 5
0
 def getBlogPost(self, filter):
     """Return n-gram terms and posts for the strongly polarized hits.

     *filter* is decoded by ``SearchFilter`` and turned into Solr
     parameters; the query is run once with ``sentiment:[0.75 TO 1.0]``
     appended and once with ``sentiment:[0.0 TO 0.25]`` appended.

     Args:
         filter: encoded filter accepted by ``SearchFilter.decode``.

     Returns:
         ``(q, p, posts)``: the two values produced by ``ngrams.main`` and
         a list of ``(title, content, sentiment)`` tuples for each hit
         with truthy content.
     """
     # Parenthesized form prints the same on Python 2 and parses on Python 3.
     print(filter)
     parsedFilter = SearchFilter()
     parsedFilter.decode(filter)

     conn = solr.SolrConnection('http://localhost:8983/solr')
     try:
         good_params = parsedFilter.toSolr()
         good_params['q'] = good_params['q'] + " sentiment:[0.75 TO 1.0]"
         good_response = conn.query(**good_params)
         bad_params = parsedFilter.toSolr()
         bad_params['q'] = bad_params['q'] + " sentiment:[0.0 TO 0.25]"
         bad_response = conn.query(**bad_params)
     finally:
         # Guarantees the connection is released on query failure too.
         conn.close()

     results = good_response.results + bad_response.results
     # NOTE(review): the searches use numeric sentiment ranges, yet hits are
     # classified by equality with True/False -- confirm the field's type.
     goodText = ''.join(hit['content'] for hit in results
                        if hit['sentiment'] == True)
     badText = ''.join(hit['content'] for hit in results
                       if hit['sentiment'] == False)
     q, p = ngrams.main(goodText, badText, ngrams.getWordsForDisplay)
     return (q, p, [(hit['title'], hit['content'], hit['sentiment'])
                    for hit in results if hit['content']])
Exemplo n.º 6
0
    def polarize(self):
        """Build and render the polarize page for the ``query`` parameter.

        Two 10-row Solr searches are issued over one connection: the query
        with ``sentiment:[0.0 TO 1.0]`` appended and the query with
        ``sentiment:[-1.0 TO 0.0]`` appended.  The concatenated ``content``
        of the hits goes through ``ngrams.main``; terms and results are
        stored on the template context ``c``.

        Returns:
            ``/index.mako`` rendered for a blank query, ``/noresults.mako``
            when both responses are falsy, else ``/polarize.mako``.
        """
        c.service = "polarize"

        query = request.params['query']

        if query.strip() == "":
            return render('/index.mako')

        # One sanitize call serves both searches below.
        safe_query = sanitize(query)

        conn = solr.SolrConnection('http://localhost:8983/solr')
        try:
            # Grab data from Solr
            good_results = conn.query(
                q=safe_query + " sentiment:[0.0 TO 1.0]", rows=10)
            bad_results = conn.query(
                q=safe_query + " sentiment:[-1.0 TO 0.0]", rows=10)
        finally:
            # Guarantees the connection is released on query failure too.
            conn.close()

        if not (good_results or bad_results):
            c.query = query
            return render('/noresults.mako')

        # Do ngrams analysis
        goodText = ''.join(hit['content'] for hit in good_results.results)
        badText = ''.join(hit['content'] for hit in bad_results.results)
        q, p = ngrams.main(goodText, badText, ngrams.getWordsForDisplay)

        # Send params to context
        c.goodTerms = q
        c.badTerms = p

        c.goodResults = good_results
        c.badResults = bad_results

        c.query = query

        return render('/polarize.mako')
Exemplo n.º 7
0
def main(language=None, corpus=None, datafolder=None, filename=None,
         maxwordtokens=0, use_corpus=True):
    """Count phones, biphones and triphones in a wordlist and write them out.

    The wordlist path and the corpus stem come from
    ``get_wordlist_path_corpus_stem``.  If the wordlist does not exist and
    ``use_corpus`` is true, ``ngrams.main`` is run first; if ``use_corpus``
    is false the program exits with an error message.

    Each wordlist line that does not start with ``#`` is stripped,
    case-folded and split on whitespace.  The first field is the phone
    string (one character per phone); the second field, when it parses as
    an integer, is the frequency (1 otherwise).  ``#`` is added on both
    sides of the phone string as a word boundary before counting.

    Output goes to a ``phon`` folder, next to ``filename`` when it is given
    and under ``datafolder/language`` otherwise: one ``.txt`` and one
    ``.json`` file each for phones, biphones and triphones.

    Args:
        language, corpus, datafolder, filename, maxwordtokens, use_corpus:
            forwarded to ``get_wordlist_path_corpus_stem``; all but
            ``use_corpus`` are also forwarded to ``ngrams.main``.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The message needs a placeholder, otherwise the path passed to
            # format() is silently dropped.
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        for line in f:
            if line.startswith("#"):
                continue

            line = line.strip().casefold()

            # Blank lines have no fields; unpacking them below would raise
            # ValueError.
            if not line:
                continue

            phones, *rest = line.split()

            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                freq = 1

            phones = "#{}#".format(phones)  # add word boundaries
            lenPhones = len(phones)

            for i in range(lenPhones - 2):

                phone1 = phones[i]
                phone2 = phones[i + 1]
                phone3 = phones[i + 2]

                phoneDict[phone3] += freq

                # The first window also accounts for the two leading
                # symbols and the first adjacent pair.
                if i == 0:
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1], reverse=True)

    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1], reverse=True)

    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1], reverse=True)

    outputs = (
        (outfilenamePhones, phonesSorted, phoneDict),
        (outfilenameBiphones, biphonesSorted, biphoneDict),
        (outfilenameTriphones, triphonesSorted, triphoneDict),
    )

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    for outpath, ranked, counter in outputs:
        with outpath.open('w') as f:
            print(intro_string, file=f)
            print("# type count: {}".format(len(ranked)), file=f)
            print("# token count: {}".format(sum(counter.values())), file=f)
            for (item, freq) in ranked:
                print(item + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    json_paths = []
    for outpath, _, counter in outputs:
        json_path = changeFilenameSuffix(outpath, '.json')
        with json_path.open('w') as f:
            json_pdump(counter, f, key=lambda x: x[1], reverse=True)
        json_paths.append(json_path)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:",
                outfilenamePhones, outfilenameBiphones, outfilenameTriphones,
                *json_paths)
Exemplo n.º 8
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         maxwordtokens=0,
         use_corpus=True):
    """Tally phone, biphone and triphone frequencies from a wordlist.

    ``get_wordlist_path_corpus_stem`` supplies the wordlist path and the
    corpus stem.  A missing wordlist triggers ``ngrams.main`` when
    ``use_corpus`` is true and terminates the program otherwise.

    Wordlist lines starting with ``#`` are skipped.  Every other line is
    stripped, case-folded and split on whitespace: field one is the phone
    string (each character is one phone) and field two is used as the
    frequency when it parses as an integer (default 1).  The phone string
    is wrapped in ``#`` word-boundary symbols before counting.

    Results are written to a ``phon`` folder (beside ``filename`` if given,
    else under ``datafolder/language``) as three ``.txt`` files and three
    ``.json`` files.

    Args:
        language, corpus, datafolder, filename, maxwordtokens, use_corpus:
            forwarded to ``get_wordlist_path_corpus_stem``; all but
            ``use_corpus`` are also forwarded to ``ngrams.main``.
    """
    print("\n*****************************************************\n"
          "Running the phon.py program now...\n")

    infilename, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    if not infilename.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language,
                        corpus=corpus,
                        datafolder=datafolder,
                        filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # Without a placeholder the path given to format() never
            # appears in the message.
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(infilename))

    if filename:
        outfolder = Path(Path(filename).parent, "phon")
    else:
        outfolder = Path(datafolder, language, 'phon')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfilenamePhones = Path(outfolder, corpusName + "_phones.txt")
    outfilenameBiphones = Path(outfolder, corpusName + "_biphones.txt")
    outfilenameTriphones = Path(outfolder, corpusName + "_triphones.txt")

    phoneDict = Counter()
    triphoneDict = Counter()
    biphoneDict = Counter()
    sep = "\t"

    print('Reading the wordlist file now...')

    with infilename.open() as f:
        for line in f:
            if line.startswith("#"):
                continue

            line = line.strip().casefold()

            # A blank line splits into zero fields and would make the
            # starred unpacking below raise ValueError.
            if not line:
                continue

            phones, *rest = line.split()

            try:
                freq = int(rest[0])
            except (ValueError, IndexError):
                freq = 1

            phones = "#{}#".format(phones)  # add word boundaries
            lenPhones = len(phones)

            for i in range(lenPhones - 2):

                phone1 = phones[i]
                phone2 = phones[i + 1]
                phone3 = phones[i + 2]

                phoneDict[phone3] += freq

                # Only the first window counts its two leading symbols and
                # their pair; later windows would double-count them.
                if i == 0:
                    phoneDict[phone1] += freq
                    phoneDict[phone2] += freq
                    biphone = phone1 + sep + phone2
                    biphoneDict[biphone] += freq

                biphone = phone2 + sep + phone3
                triphone = phone1 + sep + phone2 + sep + phone3

                triphoneDict[triphone] += freq
                biphoneDict[biphone] += freq

    print("\nCompleted counting phones, biphones, and triphones.")

    intro_string = "# data source: {}".format(str(infilename))

    phonesSorted = sorted_alphabetized(phoneDict.items(),
                                       key=lambda x: x[1],
                                       reverse=True)

    biphonesSorted = sorted_alphabetized(biphoneDict.items(),
                                         key=lambda x: x[1],
                                         reverse=True)

    triphonesSorted = sorted_alphabetized(triphoneDict.items(),
                                          key=lambda x: x[1],
                                          reverse=True)

    outputs = (
        (outfilenamePhones, phonesSorted, phoneDict),
        (outfilenameBiphones, biphonesSorted, biphoneDict),
        (outfilenameTriphones, triphonesSorted, triphoneDict),
    )

    #--------------------------------------------------------------------------#
    # generate .txt output files
    #--------------------------------------------------------------------------#

    for outpath, ranked, counter in outputs:
        with outpath.open('w') as f:
            print(intro_string, file=f)
            print("# type count: {}".format(len(ranked)), file=f)
            print("# token count: {}".format(sum(counter.values())), file=f)
            for (item, freq) in ranked:
                print(item + sep + str(freq), file=f)

    #--------------------------------------------------------------------------#
    # generate .json output files
    #--------------------------------------------------------------------------#

    json_paths = []
    for outpath, _, counter in outputs:
        json_path = changeFilenameSuffix(outpath, '.json')
        with json_path.open('w') as f:
            json_pdump(counter, f, key=lambda x: x[1], reverse=True)
        json_paths.append(json_path)

    print('phone, biphone and triphone files ready')

    stdout_list("Output files:", outfilenamePhones, outfilenameBiphones,
                outfilenameTriphones, *json_paths)
Exemplo n.º 9
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         maxwordtypes=1000,
         nNeighbors=9,
         nEigenvectors=11,
         create_WordToContexts=False,
         create_ContextToWords=False,
         mincontexts=3,
         usesigtransforms=True):
    """Compute nearest-neighbor words from n-gram contexts and write results.

    Word, bigram and trigram files are read from an ``ngrams`` folder
    (``ngrams.main`` is run first if any of them is missing).  Output goes
    to ``neighbors`` and ``word_contexts`` folders.  All folders sit next
    to ``filename`` when it is given and under ``datafolder/language``
    otherwise.

    Args:
        language, corpus, datafolder, filename: locate the input and
            output files; also forwarded to ``ngrams.main`` / ``lxa5.main``.
        maxwordtypes: upper bound on the number of word types analyzed.
        nNeighbors: forwarded to ``compute_closest_neighbors`` and
            ``compute_WordToSharedContextsOfNeighbors``.
        nEigenvectors: number of leading eigenvector columns used as
            word coordinates.
        create_WordToContexts: also dump ``WordToContexts`` as JSON.
        create_ContextToWords: also dump ``ContextToWords`` as JSON.
        mincontexts: forwarded to ``GetContextArray`` and
            ``compute_WordToSharedContextsOfNeighbors``.
        usesigtransforms: load the ``_WordToSigtransforms.json`` file,
            running ``lxa5.main`` to create it if it is missing.
    """
    print("\n*****************************************************\n"
          "Running the manifold.py program now...\n")

    if filename:
        corpusStem = Path(filename).stem
        infolder = Path(Path(filename).parent, 'ngrams')
        outfolder = Path(Path(filename).parent, 'neighbors')
        outcontextsfolder = Path(Path(filename).parent, 'word_contexts')
    else:
        corpusStem = Path(corpus).stem
        infolder = Path(datafolder, language, 'ngrams')
        outfolder = Path(datafolder, language, 'neighbors')
        outcontextsfolder = Path(datafolder, language, 'word_contexts')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outcontextsfolder.exists():
        outcontextsfolder.mkdir(parents=True)

    infileWordsname = Path(infolder, corpusStem + '_words.txt')
    infileBigramsname = Path(infolder, corpusStem + '_bigrams.txt')
    infileTrigramsname = Path(infolder, corpusStem + '_trigrams.txt')

    if not (infileWordsname.exists() and infileBigramsname.exists()
            and infileTrigramsname.exists()):
        print("Error in locating n-gram data files.\n"
              "The program now creates them.\n")
        ngrams.main(language=language,
                    corpus=corpus,
                    datafolder=datafolder,
                    filename=filename)

    if usesigtransforms:
        if filename:
            infolderlxa = Path(Path(filename).parent, 'lxa')
        else:
            infolderlxa = Path(datafolder, language, 'lxa')
        sigtransform_json_fname = Path(
            infolderlxa, corpusStem + "_WordToSigtransforms.json")
        try:
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)
        except FileNotFoundError:
            print("The file \"{}\" is not found.\n"
                  "The program now creates it.\n".format(
                      sigtransform_json_fname))
            lxa5.main(language=language,
                      corpus=corpus,
                      datafolder=datafolder,
                      filename=filename)
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)

    # WordToSigtransforms just read into the program; to be used soon...

    print('Reading word list...', flush=True)
    mywords = GetMyWords(infileWordsname, corpus)

    print("Word file is", infileWordsname, flush=True)
    print("Number of neighbors to find for each word type: ", nNeighbors)
    print('Corpus has', len(mywords), 'word types', flush=True)

    nWordsForAnalysis = min(len(mywords), maxwordtypes)
    print('number of words for analysis adjusted to', nWordsForAnalysis)

    analyzedwordlist = list(mywords.keys())[:nWordsForAnalysis]
    # The words are unique mapping keys, so enumerate() yields the same
    # word -> position mapping as list.index() without the quadratic scan.
    worddict = {w: idx for idx, w in enumerate(analyzedwordlist)}

    corpusName = corpusStem + '_' + str(nWordsForAnalysis) + '_' + str(
        nNeighbors)

    outfilenameNeighbors = Path(outfolder, corpusName + "_neighbors.txt")

    outfilenameSharedcontexts = Path(outfolder,
                                     corpusName + "_shared_contexts.txt")

    outfilenameNeighborGraph = Path(outfolder, corpusName + "_neighbors.gexf")

    outfilenameImportantContextToWords = Path(
        outfolder, corpusName + "_ImportantContextToWords.txt")

    outWordToContexts_json = Path(outcontextsfolder,
                                  corpusName + "_WordToContexts.json")

    outContextToWords_json = Path(outcontextsfolder,
                                  corpusName + "_ContextToWords.json")

    print("Reading bigrams/trigrams and computing context array...",
          flush=True)

    (context_array, contextdict,
     WordToContexts, ContextToWords) = GetContextArray(
         nWordsForAnalysis, worddict, infileBigramsname, infileTrigramsname,
         mincontexts)

    # Intermediate results are deleted as soon as they are no longer needed.
    print("Computing shared context master matrix...", flush=True)
    CountOfSharedContexts = context_array.dot(context_array.T).todense()
    del context_array

    print("Computing diameter...", flush=True)
    Diameter = Normalize(nWordsForAnalysis, CountOfSharedContexts)

    print("Computing incidence graph...", flush=True)
    incidencegraph = compute_incidence_graph(nWordsForAnalysis, Diameter,
                                             CountOfSharedContexts)
    del CountOfSharedContexts

    print("Computing mylaplacian...", flush=True)
    mylaplacian = compute_laplacian(nWordsForAnalysis, Diameter,
                                    incidencegraph)
    del Diameter
    del incidencegraph

    print("Computing eigenvectors...", flush=True)
    myeigenvalues, myeigenvectors = GetEigenvectors(mylaplacian)
    del mylaplacian
    del myeigenvalues

    print('Computing distances between words...', flush=True)
    # take first N columns of eigenvector matrix
    coordinates = myeigenvectors[:, :nEigenvectors]
    wordsdistance = compute_words_distance(nWordsForAnalysis, coordinates)
    del coordinates

    print('Computing nearest neighbors now... ', flush=True)
    closestNeighbors = compute_closest_neighbors(wordsdistance, nNeighbors)

    WordToNeighbors_by_str = OrderedDict()
    WordToNeighbors = dict()

    for wordno in range(nWordsForAnalysis):
        line = closestNeighbors[wordno]
        # Each row holds the word's own index followed by its neighbors'.
        word_idx, neighbors_idx = line[0], line[1:]
        word = analyzedwordlist[word_idx]
        neighbors = [analyzedwordlist[idx] for idx in neighbors_idx]
        WordToNeighbors_by_str[word] = neighbors
        WordToNeighbors[word_idx] = neighbors_idx

    del closestNeighbors

    with outfilenameNeighbors.open('w') as f:
        print("# language: {}\n# corpus: {}\n"
              "# Number of word types analyzed: {}\n"
              "# Number of neighbors: {}\n".format(language, corpus,
                                                   nWordsForAnalysis,
                                                   nNeighbors),
              file=f)

        for word, neighbors in WordToNeighbors_by_str.items():
            print(word, " ".join(neighbors), file=f)

    neighbor_graph = GetMyGraph(WordToNeighbors_by_str)

    # output manifold as gexf data file
    nx.write_gexf(neighbor_graph, str(outfilenameNeighborGraph))

    # output manifold as json for d3 visualization
    manifold_json_data = json_graph.node_link_data(neighbor_graph)
    outfilenameManifoldJson = Path(outfolder, corpusName + "_manifold.json")
    with outfilenameManifoldJson.open("w") as f:
        json.dump(manifold_json_data, f, indent=2)

    WordToNeighbors_json = changeFilenameSuffix(outfilenameNeighbors, ".json")
    with WordToNeighbors_json.open("w") as f:
        json_pdump(WordToNeighbors_by_str, f, asis=True)

    print("Computing shared contexts among neighbors...", flush=True)
    (WordToSharedContextsOfNeighbors,
     ImportantContextToWords) = compute_WordToSharedContextsOfNeighbors(
         nWordsForAnalysis, WordToContexts, WordToNeighbors, ContextToWords,
         nNeighbors, mincontexts)

    output_WordToSharedContextsOfNeighbors(outfilenameSharedcontexts,
                                           WordToSharedContextsOfNeighbors,
                                           worddict, contextdict,
                                           nWordsForAnalysis)

    output_ImportantContextToWords(outfilenameImportantContextToWords,
                                   ImportantContextToWords, contextdict,
                                   worddict)

    outputfilelist = [
        outfilenameNeighbors, outfilenameNeighborGraph, WordToNeighbors_json,
        outfilenameSharedcontexts, outfilenameImportantContextToWords,
        outfilenameManifoldJson
    ]

    if create_WordToContexts:
        outputfilelist.append(outWordToContexts_json)
        with outWordToContexts_json.open("w") as f:
            json_pdump(WordToContexts, f,
                       key=lambda x: len(x[1]),
                       reverse=True)

    if create_ContextToWords:
        outputfilelist.append(outContextToWords_json)
        with outContextToWords_json.open("w") as f:
            json_pdump(ContextToWords, f,
                       key=lambda x: len(x[1]),
                       reverse=True)

    stdout_list("Output files:", *outputfilelist)
Exemplo n.º 10
0
def main(language=None, corpus=None, datafolder=None, filename=None,
         MinimumStemLength=4, MinimumAffixLength=1, SF_threshold=3,
         maxwordtokens=0, use_corpus=True):
    """Build left-to-right and right-to-left tries from a wordlist.

    The wordlist path and corpus stem come from
    ``get_wordlist_path_corpus_stem``.  If the wordlist does not exist and
    ``use_corpus`` is true, ``ngrams.main`` is run first; if ``use_corpus``
    is false the program exits with an error message.

    The sorted wordlist and the sorted list of reversed words are each
    broken up with ``findBreaksInWords`` / ``BreakUpEachWord``.  Successors,
    predecessors, signatures and both tries are written as ``.txt`` files
    (plus ``.json`` for all but the signatures) into a ``tries`` folder,
    next to ``filename`` when given and under ``datafolder/language``
    otherwise.

    Args:
        language, corpus, datafolder, filename, maxwordtokens, use_corpus:
            forwarded to ``get_wordlist_path_corpus_stem``; all but
            ``use_corpus`` are also forwarded to ``ngrams.main``.
        MinimumStemLength: forwarded to ``findBreaksInWords``.
        MinimumAffixLength: accepted but not used by this function.
        SF_threshold: forwarded to ``OutputSuccessors``.
    """
    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------##
    #        read wordlist
    #--------------------------------------------------------------------##

    print("reading wordlist...", flush=True)

    wordlist_path, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language, corpus=corpus,
                        datafolder=datafolder, filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The message needs a placeholder, otherwise the path passed to
            # format() is silently dropped.
            sys.exit("\nThe specified wordlist {}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------##
    #        output settings
    #--------------------------------------------------------------------##

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")

    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")

    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------##
    #        Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    print("finding breaks in words...", flush=True)

    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------##
    #        Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------------#
    #        Compute successors and predecessors
    #--------------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)

    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    # Predecessors are the successors computed over the reversed words.
    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    # Each JSON file is written inside a ``with`` block so its handle is
    # closed as soon as the dump finishes.
    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    with outfile_SF_name_json.open("w") as f:
        json_pdump(successors, f)

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    with outfile_PF_name_json.open("w") as f:
        json_pdump(predecessors, f)

    print("printing signatures...", flush=True)
    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------------#
    #        Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------------#

    print("printing tries...", flush=True)

    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name, reversedwordlist, WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    with outfile_trieLtoR_name_json.open("w") as f:
        json_pdump(WordsBrokenLtoR, f)

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    with outfile_trieRtoL_name_json.open("w") as f:
        json_pdump(WordsBrokenRtoL, f)

    stdout_list("Output files:", outfile_SF_name, outfile_PF_name,
                outfile_trieLtoR_name, outfile_trieRtoL_name,
                outfile_Signatures_name,
                outfile_SF_name_json, outfile_PF_name_json,
                outfile_trieLtoR_name_json,
                outfile_trieRtoL_name_json)
Exemplo n.º 11
0
        try:
            goodText += hit['content']
        except Exception , e:
            print 'good_results error', e
            pass
    for hit in bad_results:
        try:
            badText += hit['content']
        except Exception , e:
            print 'bad_results error', e
            pass

    #goodText = ''.join(hit['content'] for hit in good_results)
    #badText = ''.join(hit['content'] for hit in bad_results)

    c.goodTerms, c.badTerms = ngrams.main(goodText, badText,
        ngrams.getWordsForDisplay)

    c.goodResults = good_results
    c.neutralResults = neutral_results
    c.badResults = bad_results

    return render('/analysis.mako')

  def custom(self):
    """Start an asynchronous search for the ``query`` request parameter.

    Sets ``c.service`` to ``"search"`` and stores the query on ``c``.
    A blank query renders ``/index.mako`` instead of starting a search.
    """
    c.service = "search"
    submitted = request.params['query']
    c.query = submitted

    # Whitespace-only input counts as no query at all.
    if not submitted.strip():
      return render('/index.mako')

    self.startAsyncSearch(submitted)
Exemplo n.º 12
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         MinimumStemLength=4,
         MinimumAffixLength=1,
         SF_threshold=3,
         maxwordtokens=0,
         use_corpus=True):
    """Compute successors, predecessors and tries for a wordlist; write them out.

    The wordlist is located via ``get_wordlist_path_corpus_stem``.  When it
    does not exist and ``use_corpus`` is true, ``ngrams.main`` is run first;
    when ``use_corpus`` is false the program exits with an error message that
    names the missing path.

    Output goes to a "tries" folder next to ``filename`` when ``filename`` is
    given, otherwise to ``datafolder/language/tries``.  The folder is created
    if needed.  Text and JSON files are written for successors (SF),
    predecessors (PF) and both tries, plus a signatures text file.

    Args:
        language, corpus, datafolder, filename, maxwordtokens, use_corpus:
            Forwarded to ``get_wordlist_path_corpus_stem`` (and, except for
            ``use_corpus``, to ``ngrams.main`` when the wordlist is missing).
        MinimumStemLength: Passed to ``findBreaksInWords``.
        MinimumAffixLength: Accepted for interface compatibility; not used
            in this function.
        SF_threshold: Passed to ``OutputSuccessors``.

    Raises:
        SystemExit: If the wordlist is missing and ``use_corpus`` is false.
    """
    print("\n*****************************************************\n"
          "Running the tries.py program now...\n")

    #--------------------------------------------------------------------##
    #        read wordlist
    #--------------------------------------------------------------------##

    print("reading wordlist...", flush=True)

    wordlist_path, corpusName = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language,
                        corpus=corpus,
                        datafolder=datafolder,
                        filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The original message had no placeholder, so the path passed to
            # format() was silently dropped; include it in the message.
            sys.exit("\nThe specified wordlist\n"
                     "{}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())
    # Reversed words let the same left-to-right routines work right-to-left.
    reversedwordlist = sorted([x[::-1] for x in wordlist])

    #--------------------------------------------------------------------##
    #        output settings
    #--------------------------------------------------------------------##

    if filename:
        outfolder = Path(Path(filename).parent, "tries")
    else:
        outfolder = Path(datafolder, language, "tries")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_SF_name = Path(outfolder, corpusName + "_SF.txt")
    outfile_trieLtoR_name = Path(outfolder, corpusName + "_trieLtoR.txt")

    outfile_trieRtoL_name = Path(outfolder, corpusName + "_trieRtoL.txt")
    outfile_PF_name = Path(outfolder, corpusName + "_PF.txt")

    outfile_Signatures_name = Path(outfolder, corpusName + "_Signatures.txt")

    #--------------------------------------------------------------------##
    #        Find breaks in words (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    print("finding breaks in words...", flush=True)

    breaks_LtoR = findBreaksInWords(wordlist, MinimumStemLength)
    breaks_RtoL = findBreaksInWords(reversedwordlist, MinimumStemLength)

    #--------------------------------------------------------------------##
    #        Break up each word (left-to-right and right-to-left)
    #--------------------------------------------------------------------##

    WordsBrokenLtoR = BreakUpEachWord(wordlist, breaks_LtoR)
    WordsBrokenRtoL = BreakUpEachWord(reversedwordlist, breaks_RtoL)

    #--------------------------------------------------------------------------#
    #        Compute successors and predecessors
    #--------------------------------------------------------------------------#

    print("computing successors and predecessors...", flush=True)

    successors = GetSuccessors(wordlist, WordsBrokenLtoR)
    OutputSuccessors(outfile_SF_name, successors, SF_threshold)

    predecessors = GetSuccessors(reversedwordlist, WordsBrokenRtoL)
    OutputSuccessors(outfile_PF_name, predecessors, SF_threshold, reverse=True)

    # Each JSON file is written inside a ``with`` block so the handle is
    # flushed and closed deterministically.
    outfile_SF_name_json = changeFilenameSuffix(outfile_SF_name, ".json")
    with outfile_SF_name_json.open("w") as f:
        json_pdump(successors, f)

    outfile_PF_name_json = changeFilenameSuffix(outfile_PF_name, ".json")
    with outfile_PF_name_json.open("w") as f:
        json_pdump(predecessors, f)

    print("printing signatures...", flush=True)
    OutputSignatures1(outfile_Signatures_name, successors)

    #--------------------------------------------------------------------------#
    #        Print tries (left-to-right, right-to-left)
    #--------------------------------------------------------------------------#

    print("printing tries...", flush=True)

    OutputTrie(outfile_trieLtoR_name, wordlist, WordsBrokenLtoR)
    OutputTrie(outfile_trieRtoL_name,
               reversedwordlist,
               WordsBrokenRtoL,
               reverse=True)

    outfile_trieLtoR_name_json = changeFilenameSuffix(outfile_trieLtoR_name,
                                                      ".json")
    with outfile_trieLtoR_name_json.open("w") as f:
        json_pdump(WordsBrokenLtoR, f)

    outfile_trieRtoL_name_json = changeFilenameSuffix(outfile_trieRtoL_name,
                                                      ".json")
    with outfile_trieRtoL_name_json.open("w") as f:
        json_pdump(WordsBrokenRtoL, f)

    stdout_list("Output files:", outfile_SF_name, outfile_PF_name,
                outfile_trieLtoR_name, outfile_trieRtoL_name,
                outfile_Signatures_name, outfile_SF_name_json,
                outfile_PF_name_json, outfile_trieLtoR_name_json,
                outfile_trieRtoL_name_json)
Exemplo n.º 13
0
def main(language=None,
         corpus=None,
         datafolder=None,
         filename=None,
         MinimumStemLength=4,
         MaximumAffixLength=3,
         MinimumNumberofSigUses=5,
         maxwordtokens=0,
         use_corpus=True):
    """Induce signatures from a wordlist and write the resulting tables to disk.

    The wordlist is located via ``get_wordlist_path_corpus_stem``.  When it
    does not exist and ``use_corpus`` is true, ``ngrams.main`` is run first;
    when ``use_corpus`` is false the program exits with an error message that
    names the missing path.

    Output goes to an "lxa" folder next to ``filename`` when ``filename`` is
    given, otherwise to ``datafolder/language/lxa``.  The folder is created
    if needed.  Text files are written for StemToWords, AffixToSigs,
    SigToStems, WordToSigs, WordToSigtransforms and three word-frequency
    listings; JSON files are additionally written for SigToStems, WordToSigs
    and WordToSigtransforms.

    Args:
        language: Language name; if its casefolded form is in the prefixing
            set ({"swahili"}), prefixes are sought, otherwise suffixes.
        corpus, datafolder, filename, maxwordtokens, use_corpus:
            Forwarded to ``get_wordlist_path_corpus_stem`` (and, except for
            ``use_corpus``, to ``ngrams.main`` when the wordlist is missing).
        MinimumStemLength: Passed to ``MakeBiSignatures``.
        MaximumAffixLength: Passed to ``MakeBiSignatures`` and
            ``MakeSigToStems``.
        MinimumNumberofSigUses: Passed to ``MakeStemToWords`` and
            ``MakeSigToStems``.

    Raises:
        SystemExit: If the wordlist is missing and ``use_corpus`` is false.
    """
    print("\n*****************************************************\n"
          "Running the lxa5.py program now...\n")

    # -------------------------------------------------------------------------#
    #       decide suffixing or prefixing
    # -------------------------------------------------------------------------#

    # NOTE: suffix_languages is informational only; any language that is not
    # in prefix_languages is treated as suffixing.
    suffix_languages = {
        "english", "french", "hungarian", "turkish", "russian", "german",
        "spanish", 'test'
    }
    prefix_languages = {"swahili"}

    if str(language).casefold() in prefix_languages:
        FindSuffixesFlag = False  # prefixal
    else:
        FindSuffixesFlag = True  # suffixal

    wordlist_path, corpus_stem = get_wordlist_path_corpus_stem(
        language, corpus, datafolder, filename, maxwordtokens, use_corpus)

    print("wordlist file path:\n{}\n".format(wordlist_path))

    if not wordlist_path.exists():
        if use_corpus:
            if maxwordtokens:
                warning = " ({} tokens)".format(maxwordtokens)
            else:
                warning = ""
            print("\nWordlist for {}{} not found.\n"
                  "ngrams.py is now run.\n".format(corpus, warning))
            ngrams.main(language=language,
                        corpus=corpus,
                        datafolder=datafolder,
                        filename=filename,
                        maxwordtokens=maxwordtokens)
        else:
            # The original message had no placeholder, so the path passed to
            # format() was silently dropped; include it in the message.
            sys.exit("\nThe specified wordlist\n"
                     "{}\n"
                     "is not found.".format(wordlist_path))

    wordFreqDict = read_word_freq(wordlist_path)
    wordlist = sorted(wordFreqDict.keys())

    if filename:
        outfolder = Path(Path(filename).parent, "lxa")
    else:
        outfolder = Path(datafolder, language, 'lxa')

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # TODO -- filenames not yet used in main()
    # Built with Path so they land inside outfolder; plain string
    # concatenation omitted the path separator.
    outfile_Signatures_name = Path(outfolder, corpus_stem + "_Signatures.txt")
    outfile_SigTransforms_name = Path(outfolder,
                                      corpus_stem + "_SigTransforms.txt")
    outfile_FSA_name = Path(outfolder, corpus_stem + "_FSA.txt")
    outfile_FSA_graphics_name = Path(outfolder,
                                     corpus_stem + "_FSA_graphics.png")

    # -------------------------------------------------------------------------#
    #   create: BisigToTuple
    #                  (key: tuple of bisig | value: set of (stem, word1, word2)
    #           StemToWords (key: stem | value: set of words)
    #           SigToStems  (key: tuple of sig | value: set of stems )
    #           StemToSig   (key: str of stem  | value: tuple of sig )
    #           WordToSigs  (key: str of word  | value: set of sigs )
    #           AffixToSigs (key: str of affix | value: set of sigs )
    # -------------------------------------------------------------------------#

    BisigToTuple = MakeBiSignatures(wordlist, MinimumStemLength,
                                    MaximumAffixLength, FindSuffixesFlag)
    print("BisigToTuple ready", flush=True)

    StemToWords = MakeStemToWords(BisigToTuple, MinimumNumberofSigUses)
    print("StemToWords ready", flush=True)

    SigToStems = MakeSigToStems(StemToWords, MaximumAffixLength,
                                MinimumNumberofSigUses, FindSuffixesFlag)
    print("SigToStems ready", flush=True)

    StemToSig = MakeStemToSig(SigToStems)
    print("StemToSig ready", flush=True)

    WordToSigs = MakeWordToSigs(StemToWords, StemToSig)
    print("WordToSigs ready", flush=True)

    WordToSigtransforms = MakeWordToSigtransforms(WordToSigs)
    print("WordToSigtransforms ready", flush=True)

    AffixToSigs = MakeAffixToSigs(SigToStems)
    print("AffixToSigs ready", flush=True)

    # -------------------------------------------------------------------------#
    #   generate graphs for several dicts
    # -------------------------------------------------------------------------#
    #    GenerateGraphFromDict(StemToWords, outfolder, 'StemToWords.gexf')
    #    GenerateGraphFromDict(SigToStems, outfolder, 'SigToStems.gexf')
    #    GenerateGraphFromDict(WordToSigs, outfolder, 'WordToSigs.gexf')
    #    GenerateGraphFromDict(StemToSig, outfolder, 'StemToSig.gexf')
    # -------------------------------------------------------------------------#

    # -------------------------------------------------------------------------#
    #      output stem file
    # -------------------------------------------------------------------------#

    stemfilename = Path(outfolder, '{}_StemToWords.txt'.format(corpus_stem))
    OutputLargeDict(stemfilename,
                    StemToWords,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    howmanyperline=5)

    print('===> stem file generated:', stemfilename, flush=True)

    # -------------------------------------------------------------------------#
    #      output affix file
    # -------------------------------------------------------------------------#

    affixfilename = Path(outfolder, '{}_AffixToSigs.txt'.format(corpus_stem))
    OutputLargeDict(affixfilename,
                    AffixToSigs,
                    min_cell_width=25,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureValues=True)
    print('===> affix file generated:', affixfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output SigToStems
    # -------------------------------------------------------------------------#

    SigToStems_outfilename = Path(outfolder, corpus_stem + "_SigToStems.txt")
    OutputLargeDict(SigToStems_outfilename,
                    SigToStems,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    howmanyperline=5,
                    SignatureKeys=True)

    SigToStems_outfilename_json = changeFilenameSuffix(SigToStems_outfilename,
                                                       ".json")
    # JSON files are written inside ``with`` blocks so each handle is
    # flushed and closed deterministically.
    with SigToStems_outfilename_json.open("w") as f:
        json_pdump(SigToStems,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)

    print('===> output file generated:', SigToStems_outfilename, flush=True)
    print('===> output file generated:',
          SigToStems_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigs
    # -------------------------------------------------------------------------#

    WordToSigs_outfilename = Path(outfolder, corpus_stem + "_WordToSigs.txt")
    OutputLargeDict(WordToSigs_outfilename,
                    WordToSigs,
                    key=lambda x: len(x[1]),
                    reverse=True,
                    min_cell_width=25,
                    SignatureValues=True)

    WordToSigs_outfilename_json = changeFilenameSuffix(WordToSigs_outfilename,
                                                       ".json")
    with WordToSigs_outfilename_json.open("w") as f:
        json_pdump(WordToSigs,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)

    print('===> output file generated:', WordToSigs_outfilename, flush=True)
    print('===> output file generated:',
          WordToSigs_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output WordToSigtransforms
    # -------------------------------------------------------------------------#

    WordToSigtransforms_outfilename = Path(
        outfolder, corpus_stem + "_WordToSigtransforms.txt")
    OutputLargeDict(WordToSigtransforms_outfilename,
                    WordToSigtransforms,
                    min_cell_width=25,
                    sigtransforms=True,
                    key=lambda x: len(x[1]),
                    reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename,
          flush=True)

    WordToSigtransforms_outfilename_json = changeFilenameSuffix(
        WordToSigtransforms_outfilename, ".json")
    with WordToSigtransforms_outfilename_json.open("w") as f:
        json_pdump(WordToSigtransforms,
                   f,
                   key=lambda x: len(x[1]),
                   reverse=True)
    print('===> output file generated:',
          WordToSigtransforms_outfilename_json,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the most freq word types not in any induced paradigms {the, of..}
    # -------------------------------------------------------------------------#

    wordFreqDict_sorted = sorted_alphabetized(wordFreqDict.items(),
                                              key=lambda x: x[1],
                                              reverse=True)

    mostFreqWordsNotInSigs_outfilename = Path(
        outfolder, corpus_stem + "_mostFreqWordsNotInSigs.txt")

    with mostFreqWordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            # Stop at the first word that belongs to a signature: only the
            # leading run of unanalyzed words is wanted here.
            if word not in WordToSigs:
                print(word, freq, file=f)
            else:
                break

    print('===> output file generated:',
          mostFreqWordsNotInSigs_outfilename,
          flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types in induced paradigms
    # -------------------------------------------------------------------------#

    WordsInSigs_outfilename = Path(outfolder, corpus_stem + "_WordsInSigs.txt")

    with WordsInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:', WordsInSigs_outfilename, flush=True)

    # -------------------------------------------------------------------------#
    #   output the word types NOT in induced paradigms
    # -------------------------------------------------------------------------#

    WordsNotInSigs_outfilename = Path(outfolder,
                                      corpus_stem + "_WordsNotInSigs.txt")

    with WordsNotInSigs_outfilename.open('w') as f:
        for (word, freq) in wordFreqDict_sorted:
            if word not in WordToSigs:
                print(word, freq, file=f)

    print('===> output file generated:',
          WordsNotInSigs_outfilename,
          flush=True)
Exemplo n.º 14
0
            try:
                goodText += hit['content']
            except Exception, e:
                print 'good_results error', e
                pass
        for hit in bad_results:
            try:
                badText += hit['content']
            except Exception, e:
                print 'bad_results error', e
                pass

        #goodText = ''.join(hit['content'] for hit in good_results)
        #badText = ''.join(hit['content'] for hit in bad_results)

        c.goodTerms, c.badTerms = ngrams.main(goodText, badText,
                                              ngrams.getWordsForDisplay)

        c.goodResults = good_results
        c.neutralResults = neutral_results
        c.badResults = bad_results

        return render('/analysis.mako')

    def custom(self):
        """Handle a custom search request.

        Marks the template context as the "search" service, stores the
        'query' request parameter on it, and then either re-renders the
        index page (blank query) or hands the query to
        ``self.startAsyncSearch``.  The visible code has no explicit return
        on the non-blank path.
        """
        c.service = "search"
        query = request.params['query']
        c.query = query

        # A query consisting only of whitespace sends the user back to the
        # index page.
        if not query.strip():
            return render('/index.mako')

        self.startAsyncSearch(query)
Exemplo n.º 15
0
def main(
    language=None,
    corpus=None,
    datafolder=None,
    filename=None,
    maxwordtypes=1000,
    nNeighbors=9,
    nEigenvectors=11,
    create_WordToContexts=False,
    create_ContextToWords=False,
    mincontexts=3,
    usesigtransforms=True,
):
    """Compute nearest-neighbor words from n-gram contexts and write the results.

    Input n-gram files (words, bigrams, trigrams) are expected in an "ngrams"
    folder; if any is missing, ``ngrams.main`` is run to create them.  When
    ``usesigtransforms`` is true, the WordToSigtransforms JSON file from the
    "lxa" folder is loaded, running ``lxa5.main`` first if the file is not
    found (the loaded data is not used further in this function yet).

    Folders are taken relative to ``filename``'s parent directory when
    ``filename`` is given, otherwise relative to ``datafolder/language``.
    Results are written to the "neighbors" folder (neighbors as text, JSON
    and gexf; a manifold JSON; shared contexts; important contexts) and,
    optionally, to the "word_contexts" folder.

    Args:
        language, corpus, datafolder, filename: Locate the input and output
            folders; also forwarded to ``ngrams.main`` / ``lxa5.main``.
        maxwordtypes: Upper bound on the number of word types analyzed.
        nNeighbors: Number of neighbors to find for each word type.
        nEigenvectors: Number of leading eigenvector columns used as
            coordinates.
        create_WordToContexts: If true, also dump WordToContexts as JSON.
        create_ContextToWords: If true, also dump ContextToWords as JSON.
        mincontexts: Passed to ``GetContextArray`` and
            ``compute_WordToSharedContextsOfNeighbors``.
        usesigtransforms: If true, load (or create) WordToSigtransforms.
    """
    print("\n*****************************************************\n" "Running the manifold.py program now...\n")

    if filename:
        corpusStem = Path(filename).stem
        infolder = Path(Path(filename).parent, "ngrams")
        outfolder = Path(Path(filename).parent, "neighbors")
        outcontextsfolder = Path(Path(filename).parent, "word_contexts")
    else:
        corpusStem = Path(corpus).stem
        infolder = Path(datafolder, language, "ngrams")
        outfolder = Path(datafolder, language, "neighbors")
        outcontextsfolder = Path(datafolder, language, "word_contexts")

    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    if not outcontextsfolder.exists():
        outcontextsfolder.mkdir(parents=True)

    infileWordsname = Path(infolder, corpusStem + "_words.txt")
    infileBigramsname = Path(infolder, corpusStem + "_bigrams.txt")
    infileTrigramsname = Path(infolder, corpusStem + "_trigrams.txt")

    if (not infileWordsname.exists()) or (not infileBigramsname.exists()) or (not infileTrigramsname.exists()):
        print("Error in locating n-gram data files.\n" "The program now creates them.\n")
        ngrams.main(language=language, corpus=corpus, datafolder=datafolder, filename=filename)

    if usesigtransforms:
        if filename:
            infolderlxa = Path(Path(filename).parent, "lxa")
        else:
            infolderlxa = Path(datafolder, language, "lxa")
        sigtransform_json_fname = Path(infolderlxa, corpusStem + "_WordToSigtransforms.json")
        try:
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)
        except FileNotFoundError:
            print('The file "{}" is not found.\n' "The program now creates it.\n".format(sigtransform_json_fname))
            lxa5.main(language=language, corpus=corpus, datafolder=datafolder, filename=filename)
            with sigtransform_json_fname.open() as f:
                WordToSigtransforms = json_pload(f)

    # WordToSigtransforms just read into the program; to be used soon...

    print("Reading word list...", flush=True)
    mywords = GetMyWords(infileWordsname, corpus)

    print("Word file is", infileWordsname, flush=True)
    print("Number of neighbors to find for each word type: ", nNeighbors)
    print("Corpus has", len(mywords), "word types", flush=True)

    # Never analyze more word types than the corpus actually has.
    nWordsForAnalysis = min(len(mywords), maxwordtypes)
    print("number of words for analysis adjusted to", nWordsForAnalysis)

    analyzedwordlist = list(mywords.keys())[:nWordsForAnalysis]
    # Map each word to its position in one pass; the keys are unique, so this
    # equals list.index() without the quadratic scan.
    worddict = {w: idx for idx, w in enumerate(analyzedwordlist)}

    corpusName = corpusStem + "_" + str(nWordsForAnalysis) + "_" + str(nNeighbors)

    outfilenameNeighbors = Path(outfolder, corpusName + "_neighbors.txt")

    outfilenameSharedcontexts = Path(outfolder, corpusName + "_shared_contexts.txt")

    outfilenameNeighborGraph = Path(outfolder, corpusName + "_neighbors.gexf")

    outfilenameImportantContextToWords = Path(outfolder, corpusName + "_ImportantContextToWords.txt")

    outWordToContexts_json = Path(outcontextsfolder, corpusName + "_WordToContexts.json")

    outContextToWords_json = Path(outcontextsfolder, corpusName + "_ContextToWords.json")

    print("Reading bigrams/trigrams and computing context array...", flush=True)

    context_array, contextdict, WordToContexts, ContextToWords = GetContextArray(
        nWordsForAnalysis, worddict, infileBigramsname, infileTrigramsname, mincontexts
    )

    # Large intermediates are deleted as soon as they are no longer needed to
    # keep peak memory down.
    print("Computing shared context master matrix...", flush=True)
    CountOfSharedContexts = context_array.dot(context_array.T).todense()
    del context_array

    print("Computing diameter...", flush=True)
    Diameter = Normalize(nWordsForAnalysis, CountOfSharedContexts)

    print("Computing incidence graph...", flush=True)
    incidencegraph = compute_incidence_graph(nWordsForAnalysis, Diameter, CountOfSharedContexts)
    del CountOfSharedContexts

    print("Computing mylaplacian...", flush=True)
    mylaplacian = compute_laplacian(nWordsForAnalysis, Diameter, incidencegraph)
    del Diameter
    del incidencegraph

    print("Computing eigenvectors...", flush=True)
    myeigenvalues, myeigenvectors = GetEigenvectors(mylaplacian)
    del mylaplacian
    del myeigenvalues

    print("Computing distances between words...", flush=True)
    # take first N columns of eigenvector matrix
    coordinates = myeigenvectors[:, :nEigenvectors]
    wordsdistance = compute_words_distance(nWordsForAnalysis, coordinates)
    del coordinates

    print("Computing nearest neighbors now... ", flush=True)
    closestNeighbors = compute_closest_neighbors(wordsdistance, nNeighbors)

    WordToNeighbors_by_str = OrderedDict()
    WordToNeighbors = dict()

    for wordno in range(nWordsForAnalysis):
        line = closestNeighbors[wordno]
        # The first entry of each row is the word's own index; the rest are
        # the indices of its neighbors.
        word_idx, neighbors_idx = line[0], line[1:]
        word = analyzedwordlist[word_idx]
        neighbors = [analyzedwordlist[idx] for idx in neighbors_idx]
        WordToNeighbors_by_str[word] = neighbors
        WordToNeighbors[word_idx] = neighbors_idx

    del closestNeighbors

    with outfilenameNeighbors.open("w") as f:
        print(
            "# language: {}\n# corpus: {}\n"
            "# Number of word types analyzed: {}\n"
            "# Number of neighbors: {}\n".format(language, corpus, nWordsForAnalysis, nNeighbors),
            file=f,
        )

        for word, neighbors in WordToNeighbors_by_str.items():
            print(word, " ".join(neighbors), file=f)

    neighbor_graph = GetMyGraph(WordToNeighbors_by_str)

    # output manifold as gexf data file
    nx.write_gexf(neighbor_graph, str(outfilenameNeighborGraph))

    # output manifold as json for d3 visualization
    manifold_json_data = json_graph.node_link_data(neighbor_graph)
    outfilenameManifoldJson = Path(outfolder, corpusName + "_manifold.json")
    # JSON files are written inside ``with`` blocks so each handle is flushed
    # and closed deterministically.
    with outfilenameManifoldJson.open("w") as f:
        json.dump(manifold_json_data, f, indent=2)

    WordToNeighbors_json = changeFilenameSuffix(outfilenameNeighbors, ".json")
    with WordToNeighbors_json.open("w") as f:
        json_pdump(WordToNeighbors_by_str, f, asis=True)

    print("Computing shared contexts among neighbors...", flush=True)
    WordToSharedContextsOfNeighbors, ImportantContextToWords = compute_WordToSharedContextsOfNeighbors(
        nWordsForAnalysis, WordToContexts, WordToNeighbors, ContextToWords, nNeighbors, mincontexts
    )

    output_WordToSharedContextsOfNeighbors(
        outfilenameSharedcontexts, WordToSharedContextsOfNeighbors, worddict, contextdict, nWordsForAnalysis
    )

    output_ImportantContextToWords(outfilenameImportantContextToWords, ImportantContextToWords, contextdict, worddict)

    outputfilelist = [
        outfilenameNeighbors,
        outfilenameNeighborGraph,
        WordToNeighbors_json,
        outfilenameSharedcontexts,
        outfilenameImportantContextToWords,
        outfilenameManifoldJson,
    ]

    if create_WordToContexts:
        outputfilelist.append(outWordToContexts_json)
        with outWordToContexts_json.open("w") as f:
            json_pdump(WordToContexts, f, key=lambda x: len(x[1]), reverse=True)

    if create_ContextToWords:
        outputfilelist.append(outContextToWords_json)
        with outContextToWords_json.open("w") as f:
            json_pdump(ContextToWords, f, key=lambda x: len(x[1]), reverse=True)

    stdout_list("Output files:", *outputfilelist)