# Standard-library imports needed by the functions below; setup, util, log and
# function_words_map are assumed to be provided elsewhere in this project.
import heapq
import json
import os
import re
from pathlib import Path


def generate_top_bipos(save_path):
    log('Generating top bipos')
    all_bipos = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top bipos for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            # Each line is a stringified list of (token, POS) tuples;
                            # split it into "token', 'POS" fragments.
                            pos_tokens = re.split(r"'\), \('|'\), \(\"", line)
                            # Stop two fragments early: the last fragment still carries
                            # the trailing "')]" and would yield a malformed tag.
                            for i in range(len(pos_tokens) - 2):
                                bigram = re.split("', '|\", '", pos_tokens[i])[1] + " " + \
                                         re.split("', '|\", '", pos_tokens[i + 1])[1]
                                if bigram not in all_bipos:
                                    all_bipos[bigram] = 1
                                else:
                                    all_bipos[bigram] += 1
    top_bipos = heapq.nlargest(300, all_bipos, key=all_bipos.get)  # keep the 300 most frequent POS bigrams
    util.save_file(save_path, top_bipos)
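
# Illustrative sketch (not part of the original module): the regex splits above
# assume each line is the str() of a list of (token, POS-tag) tuples, e.g. the
# output of a POS tagger such as nltk.pos_tag. The sample line below is an
# assumption about that format; it shows how the tags are extracted and why the
# loop stops two fragments early.
def _example_bipos_line_parsing():
    sample_line = "[('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]"
    pos_tokens = re.split(r"'\), \('|'\), \(\"", sample_line)
    # pos_tokens == ["[('The', 'DT", "cat', 'NN", "sat', 'VBD')]"]
    bigrams = []
    for i in range(len(pos_tokens) - 2):
        first = re.split("', '|\", '", pos_tokens[i])[1]       # POS tag of token i
        second = re.split("', '|\", '", pos_tokens[i + 1])[1]  # POS tag of token i + 1
        bigrams.append(first + " " + second)
    return bigrams  # == ['DT NN']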
def provide_corpus_functionwords(numOfFunctionwords):
    global top_corpus_functionwords
    top_corpus_functionwords = {}
    synchronized_functionwords_file_path = Path(
        "vectors_handling/vectors/synchronized_functionwords/synchronized_functionwords.txt")
    if not util.exists(synchronized_functionwords_file_path):  # no cached file on disk yet
        log('Cannot find synchronized_functionwords file')
        corpus_functionwords = {}
        for domain_dir in os.scandir(setup.database):
            if domain_dir.name == 'europe_data' and setup.domain == 'in':
                for country_dir in os.scandir(domain_dir):
                    country_name = os.path.basename(country_dir).split('.')[1]
                    log('Counting function words in ' + country_name)
                    for user_dir in os.scandir(country_dir):
                        for file_dir in os.scandir(user_dir):
                            with open(file_dir, "r", encoding="utf-8") as file:
                                lines = file.readlines()
                            for line in lines:
                                for word in line.split():
                                    if word in function_words_map:
                                        if word not in corpus_functionwords:
                                            corpus_functionwords[word] = 1
                                        else:
                                            corpus_functionwords[word] += 1
        top_corpus_functionwords = heapq.nlargest(numOfFunctionwords, corpus_functionwords,
                                                  key=corpus_functionwords.get)
        util.save_file(synchronized_functionwords_file_path, top_corpus_functionwords)
    top_corpus_functionwords = util.load_file(synchronized_functionwords_file_path)
    return top_corpus_functionwords
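
# Hypothetical call site (the argument value is an illustrative assumption, not
# taken from the project): the first call scans the corpus and caches the list
# on disk; later calls simply reload the cached file.
#
#     top_function_words = provide_corpus_functionwords(400)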
def generate_top_unigrams(save_path):
    log('Generating top unigrams')
    all_unigrams = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top unigrams for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            for token in line.split():
                                if token not in all_unigrams:
                                    all_unigrams[token] = 1
                                else:
                                    all_unigrams[token] += 1
    top_unigrams = heapq.nlargest(1000, all_unigrams, key=all_unigrams.get)  # keep the 1000 most frequent unigrams
    util.save_file(save_path, top_unigrams)
def generate_top_trichars(save_path):
    log('Generating top trichars')
    all_trichars = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top trichars for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            # Lines are read as fixed-width 11-character records; the three
                            # characters of each trichar sit at offsets 1, 4 and 7.
                            if len(line) >= 11:
                                cur_char = 0
                                # Guard on cur_char + 7 so a trailing partial record
                                # cannot index past the end of the line.
                                while cur_char + 7 < len(line):
                                    trichar = line[cur_char + 1] + line[cur_char + 4] + line[cur_char + 7]
                                    if trichar not in all_trichars:
                                        all_trichars[trichar] = 1
                                    else:
                                        all_trichars[trichar] += 1
                                    cur_char += 11
    top_trichars = heapq.nlargest(1000, all_trichars, key=all_trichars.get)  # keep the 1000 most frequent trichars
    util.save_file(save_path, top_trichars)
def generate(saving_path):
    log('Generating <' + setup.feature + ',' + setup.domain + '> user vectors')
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            users = process_dir(domain_dir)
        elif domain_dir.name == 'non_europe_data' and setup.domain == 'out':
            users = process_dir(domain_dir)
    util.save_file(saving_path, users)
def generate_top_spelling_errors(save_path):
    log('Generating top spelling errors')
    all_spelling_errors = {}
    for domain_dir in os.scandir(setup.database):
        if domain_dir.name == 'europe_data' and setup.domain == 'in':
            for country_dir in os.scandir(domain_dir):
                country_name = os.path.basename(country_dir).split('.')[1]
                log('Generating top spelling errors for ' + country_name)
                for user_dir in os.scandir(country_dir):
                    errors = []
                    for file_dir in os.scandir(user_dir):
                        with open(file_dir, "r", encoding="utf-8") as file:
                            lines = file.readlines()
                        for line in lines:
                            # Each line is a JSON array of per-token correction records.
                            json_data = json.loads(line)
                            for json_token in json_data:
                                if 'deletions' in json_token:
                                    if '[]' not in str(json_token['deletions']):  # non-empty
                                        for component in json_token['deletions']:
                                            errors.append("del: " + component)
                                if 'insertions' in json_token:
                                    if '[]' not in str(json_token['insertions']):  # non-empty
                                        for component in json_token['insertions']:
                                            errors.append("ins: " + component)
                                if 'replacements' in json_token:
                                    if '[]' not in str(json_token['replacements']):  # non-empty
                                        for component in json_token['replacements']:
                                            errors.append("rep: " + str(component))
                    # Tally this user's collected errors into the global counts.
                    for error in errors:
                        if error not in all_spelling_errors:
                            all_spelling_errors[error] = 1
                        else:
                            all_spelling_errors[error] += 1
    top_spelling_errors = heapq.nlargest(400, all_spelling_errors, key=all_spelling_errors.get)  # keep the 400 most frequent spelling errors
    util.save_file(save_path, top_spelling_errors)
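
# Illustrative sketch (an assumption, not taken from the corpus): each line read
# by generate_top_spelling_errors is expected to be a JSON array of per-token
# objects whose 'deletions', 'insertions' and 'replacements' fields list the
# spell-checker's suggested edits for that token.
def _example_spelling_error_line():
    sample_line = ('[{"token": "recieve", "deletions": [], "insertions": [], '
                   '"replacements": [["ie", "ei"]]}]')
    errors = []
    for json_token in json.loads(sample_line):
        if '[]' not in str(json_token['replacements']):  # non-empty replacement list
            for component in json_token['replacements']:
                errors.append("rep: " + str(component))
    return errors  # == ["rep: ['ie', 'ei']"]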