Exemplo n.º 1
0
def provide_corpus_functionwords(numOfFunctionwords):
    """Return the most frequent function words of the corpus.

    The result is cached in the synchronized function-words file. When
    ``util.exists`` reports that file as missing, it is built first: every
    file below the ``europe_data`` directory of ``setup.database`` is read
    (only while ``setup.domain == 'in'``), whitespace-separated tokens that
    are keys of the module-level ``function_words_map`` are counted, and the
    ``numOfFunctionwords`` most frequent ones are written with
    ``util.save_file``.  In both cases the file is then read back with
    ``util.load_file``; the loaded value is stored in the global
    ``top_corpus_functionwords`` and returned.

    Args:
        numOfFunctionwords: Number of most frequent function words to keep
            when the cache file has to be built.  It is not used when the
            cache file already exists.

    Returns:
        Whatever ``util.load_file`` yields for the cache file.
    """
    global top_corpus_functionwords
    top_corpus_functionwords = {}

    synchronized_functionwords_file_path = Path("vectors_handling/vectors/synchronized_functionwords/synchronized_functionwords.txt")

    if not util.exists(synchronized_functionwords_file_path):  # cache file is missing
        log('Cannot find synchronized_functionwords file')

        corpus_functionwords = {}

        for domain_dir in os.scandir(setup.database):
            # Only the europe_data directory is counted, and only for the 'in' domain.
            if domain_dir.name != 'europe_data' or setup.domain != 'in':
                continue
            for country_dir in os.scandir(domain_dir):
                # The country name is the second dot-separated part of the directory name.
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Counting function words in ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        # The context manager closes each file as soon as it has
                        # been counted, so handles do not pile up over the corpus.
                        with open(file_dir, "r", encoding="utf-8") as corpus_file:
                            for line in corpus_file:
                                for word in line.split():
                                    if word in function_words_map:
                                        corpus_functionwords[word] = corpus_functionwords.get(word, 0) + 1

        top_corpus_functionwords = heapq.nlargest(numOfFunctionwords, corpus_functionwords,
                                                  key=corpus_functionwords.get)
        util.save_file(synchronized_functionwords_file_path, top_corpus_functionwords)

    # Always return what is on disk so cached and freshly built results agree.
    top_corpus_functionwords = util.load_file(synchronized_functionwords_file_path)
    return top_corpus_functionwords
Exemplo n.º 2
0
def provide_function_words_map():
    """Build the global ``function_words_map`` from the function-words file.

    The file content is loaded with ``util.load_file`` and then emptied one
    item at a time with ``heappop``; each popped item is mapped to the number
    of pops that preceded it.

    NOTE(review): ``heappop`` only yields items in ascending order when the
    loaded list already satisfies the heap invariant - confirm against
    whatever writes this file.
    """
    global function_words_map

    words_path = Path("vectors_handling/vectors/functionwords/function_words.txt")
    pending_words = util.load_file(words_path)

    function_words_map = {}

    # The number of pops is fixed up front because the list shrinks as we go.
    pops_needed = len(pending_words)
    rank = 0
    while rank < pops_needed:
        word = heappop(pending_words)
        function_words_map[word] = rank
        rank += 1
Exemplo n.º 3
0
def provide_top_spelling_errors():
    """Return the content of the top spelling-errors file.

    When ``util.exists`` reports the file as missing, it is first produced by
    ``generate_top_spelling_errors``.  The file is then read with
    ``util.load_file`` and the loaded value is returned.
    """
    spelling_file_path = Path("vectors_handling/vectors/spelling_errors/top_spelling_errors.txt")

    if not util.exists(spelling_file_path):  # file has not been generated yet
        # The message names the spelling-errors file (it previously said
        # "bipos", a leftover from a sibling provider).
        log('Cannot find top spelling errors file')
        generate_top_spelling_errors(spelling_file_path)

    top_spelling_errors = util.load_file(spelling_file_path)
    return top_spelling_errors
Exemplo n.º 4
0
def provide_top_unigram():
    """Return the content of the top-unigrams file.

    If ``util.exists`` reports the file as missing, a message is logged and
    ``generate_top_unigrams`` is called with the path before the file is
    loaded with ``util.load_file``.
    """
    cache_path = Path("vectors_handling/vectors/unigrams/top_unigrams.txt")

    # Fast path: the file is already there, just load it.
    if util.exists(cache_path):
        return util.load_file(cache_path)

    log('Cannot find top unigrams file')
    generate_top_unigrams(cache_path)
    return util.load_file(cache_path)
Exemplo n.º 5
0
def provide_top_tripos():
    """Return the content of the top-tripos file.

    If ``util.exists`` reports the file as missing, a message is logged and
    ``generate_top_tripos`` is called with the path before the file is loaded
    with ``util.load_file``.
    """
    cache_path = Path("vectors_handling/vectors/pos/top_tripos.txt")

    # Fast path: the file is already there, just load it.
    if util.exists(cache_path):
        return util.load_file(cache_path)

    log('Cannot find top tripos file')
    generate_top_tripos(cache_path)
    return util.load_file(cache_path)
Exemplo n.º 6
0
def get_database():
    """Set the global ``database`` from the stored base entry and ``feature``.

    The first element loaded from ``utilities/database_dir.txt`` is
    concatenated with ``util.FeatureToDirectory[feature]``; ``feature`` is
    read from the enclosing (module) scope.
    """
    global database
    base_entry = util.load_file('utilities/database_dir.txt')[0]
    feature_part = util.FeatureToDirectory[feature]
    database = base_entry + feature_part