Example #1
import heapq
import os
import re

# These imports, plus the project-local helpers setup, util and log, are
# assumed by all snippets below; Example #6 additionally needs json.
def generate_top_bipos(save_path):
    log('Generating top bipos')
    all_bipos = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                country_name = country_dir.name.split('.')[1]
                log('Generating top bipos for ' + country_name)
                for user_dir in os.scandir(country_dir):

                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()

                        for line in lines:  # parse lines within chunk text
                            pos_tokens = re.split(r"'\), \('|'\), \(\"", line)
                            # stop at len - 2: the last token keeps the closing
                            # "')]", so its tag would be garbage
                            for i in range(len(pos_tokens) - 2):
                                first_tag = re.split(r"', '|\", '", pos_tokens[i])[1]
                                second_tag = re.split(r"', '|\", '", pos_tokens[i + 1])[1]
                                bigram = first_tag + " " + second_tag
                                all_bipos[bigram] = all_bipos.get(bigram, 0) + 1

    top_bipos = heapq.nlargest(300, all_bipos, key=all_bipos.get)  # fetch top 300 bipos
    util.save_file(save_path, top_bipos)
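
The split patterns above are easier to follow on a concrete line. A minimal sketch, assuming each line stores NLTK-style POS tuples serialized with str() (the sample line is illustrative, not taken from the corpus):

import re

line = "[('I', 'PRP'), ('love', 'VBP'), ('cats', 'NNS')]"  # assumed format

pos_tokens = re.split(r"'\), \('|'\), \(\"", line)
# ["[('I', 'PRP", "love', 'VBP", "cats', 'NNS')]"]

tags = [re.split(r"', '|\", '", token)[1] for token in pos_tokens]
# ['PRP', 'VBP', "NNS')]"]; the last tag keeps trailing junk, which is
# why the loop above stops at len(pos_tokens) - 2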
Example #2
def provide_corpus_functionwords(numOfFunctionwords):
    global top_corpus_functionwords
    top_corpus_functionwords = {}

    synchronized_functionwords_file_path = Path("vectors_handling/vectors/synchronized_functionwords/synchronized_functionwords.txt")

    if not util.exists(synchronized_functionwords_file_path):  # cache file not created yet
        log('Cannot find synchronized_functionwords file')

        corpus_functionwords = {}

        for domain_dir in os.scandir(setup.database):
            if domain_dir.name == 'europe_data' and setup.domain == 'in':
                for country_dir in os.scandir(domain_dir):
                    country_name = country_dir.name.split('.')[1]
                    log('Counting function words in ' + country_name)
                    for user_dir in os.scandir(country_dir):
                        for file_dir in os.scandir(user_dir):
                            with open(file_dir, "r", encoding="utf-8") as file:
                                lines = file.readlines()
                            for line in lines:
                                words = line.split()
                                for word in words:
                                    if word in function_words_map:
                                        corpus_functionwords[word] = corpus_functionwords.get(word, 0) + 1

        top_corpus_functionwords = heapq.nlargest(numOfFunctionwords, corpus_functionwords,
                                                  key=corpus_functionwords.get)
        util.save_file(synchronized_functionwords_file_path, top_corpus_functionwords)

    top_corpus_functionwords = util.load_file(synchronized_functionwords_file_path)
    return top_corpus_functionwords
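
The exists/compute/save/load flow above is a general compute-once cache pattern. A minimal standalone sketch of the same idea, using json in place of the project's util serializer (the path and sample value are illustrative):

import json
from pathlib import Path

def cached_json(path, compute):
    # build the value only when the cache file is missing, persist it,
    # then always serve the on-disk copy
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(compute()), encoding="utf-8")
    return json.loads(path.read_text(encoding="utf-8"))

top_words = cached_json(Path("cache/top_words.json"), lambda: ["the", "of", "and"])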
Example #3
def generate_top_unigrams(save_path):
    log('Generating top unigrams')
    all_unigrams = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                country_name = country_dir.name.split('.')[1]
                log('Generating top unigrams for ' + country_name)
                for user_dir in os.scandir(country_dir):

                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            unigrams = line.split()
                            for token in unigrams:
                                all_unigrams[token] = all_unigrams.get(token, 0) + 1

    top_unigrams = heapq.nlargest(1000, all_unigrams, key=all_unigrams.get)  # fetch top 1000 unigrams
    util.save_file(save_path, top_unigrams)
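
The same tally can also be written with collections.Counter, which replaces both the dict bookkeeping and the heapq call; a sketch, with `lines` standing in for the file contents read in the loops above:

from collections import Counter

lines = ["the cat sat", "the dog sat"]  # placeholder input
all_unigrams = Counter(token for line in lines for token in line.split())
top_unigrams = [token for token, _ in all_unigrams.most_common(1000)]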
Example #4
def generate_top_trichars(save_path):
    log('Generating top trichars')
    all_trichars = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                country_name = country_dir.name.split('.')[1]
                log('Generating top trichars for ' + country_name)
                for user_dir in os.scandir(country_dir):

                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:  # parse lines within chunk text

                            # walk the line in full 11-character windows, keeping
                            # the characters at offsets 1, 4 and 7; requiring a
                            # complete window fixes the IndexError that the old
                            # `cur_char < len(line)` bound allowed on partial windows
                            cur_char = 0
                            while cur_char + 11 <= len(line):
                                trichar = line[cur_char + 1] + line[cur_char + 4] + line[cur_char + 7]
                                all_trichars[trichar] = all_trichars.get(trichar, 0) + 1
                                cur_char += 11

    top_trichars = heapq.nlargest(1000, all_trichars, key=all_trichars.get)  # fetch top 1000 trichars
    util.save_file(save_path, top_trichars)
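
Note that the fixed offsets 1, 4 and 7 inside each 11-character window are specific to this repository's line format. For comparison, the conventional definition of character trigrams uses every overlapping 3-character window; a sketch of that variant, not the repository's extraction rule:

from collections import Counter

def char_trigrams(text):
    # all overlapping 3-character windows
    return Counter(text[i:i + 3] for i in range(len(text) - 2))

char_trigrams("banana")
# Counter({'ana': 2, 'ban': 1, 'nan': 1})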
Example #5
def generate(saving_path):
    log('Generating <' + setup.feature + ',' + setup.domain + '> user vectors')
    users = None  # guard against NameError when neither branch matches

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            users = process_dir(domain_dir)
        elif domain_dir.name == 'non_europe_data' and setup.domain == 'out':
            users = process_dir(domain_dir)

    util.save_file(saving_path, users)
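
The two branches differ only in the domain/directory pairing, so the dispatch can be made table-driven. A self-contained sketch (DOMAIN_DIRS and pick_domain_dir are illustrative names, not part of the project):

import os

DOMAIN_DIRS = {'in': 'europe_data', 'out': 'non_europe_data'}

def pick_domain_dir(database, domain):
    # return the single directory entry matching the configured domain
    wanted = DOMAIN_DIRS.get(domain)
    for entry in os.scandir(database):
        if entry.name == wanted:
            return entry
    return None

generate() could then call process_dir(pick_domain_dir(setup.database, setup.domain)) in place of the twin if/elif branches.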
Example #6
def generate_top_spelling_errors(save_path):
    log('Generating top spelling errors')
    all_spelling_errors = {}

    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':

            for country_dir in os.scandir(domain_dir):
                    country_name = country_dir.name.split('.')[1]
                log('Generating top spelling errors for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    errors = []
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()

                        for line in lines:  # parse lines within chunk text
                            json_data = json.loads(line)
                            for json_token in json_data:
                                # .get(...) yields [] for missing keys, so empty or
                                # absent edit lists are skipped without the brittle
                                # '[]' substring test
                                for component in json_token.get('deletions', []):
                                    errors.append("del: " + component)

                                for component in json_token.get('insertions', []):
                                    errors.append("ins: " + component)

                                for component in json_token.get('replacements', []):
                                    errors.append("rep: " + str(component))

                    for error in errors:
                        all_spelling_errors[error] = all_spelling_errors.get(error, 0) + 1

    top_spelling_errors = heapq.nlargest(400, all_spelling_errors, key=all_spelling_errors.get)  # fetch top 400 spelling errors
    util.save_file(save_path, top_spelling_errors)
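
A minimal sketch of the line format this parser expects (assumption: each line is a JSON array of spell-checker records whose deletions/insertions/replacements lists may be empty; the sample line is illustrative):

import json

line = '[{"deletions": ["h"], "insertions": [], "replacements": [["teh", "the"]]}]'
for json_token in json.loads(line):
    for component in json_token.get('deletions', []):
        print("del: " + component)
    for component in json_token.get('replacements', []):
        print("rep: " + str(component))
# del: h
# rep: ['teh', 'the']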