Exemplo n.º 1
0
def main():
    """Run ``analyze_text`` over every entry of the input directory.

    For each name returned by ``os.listdir`` on the input directory, the
    input path and the path of the same file name inside the output
    directory are built with ``fh.get_file_name`` and handed to
    ``analyze_text``.
    """
    in_directory = "/Users/a.merenova/Desktop/projects/python/infosearch/files"
    out_directory = "/Users/a.merenova/Desktop/projects/python/infosearch/lems"
    file_names = os.listdir(in_directory)
    # exist_ok=True keeps a second run from failing with FileExistsError
    # once the output directory has already been created.
    os.makedirs(out_directory, exist_ok=True)
    for file_name in file_names:
        analyze_text(fh.get_file_name(file_name, in_directory),
                     fh.get_file_name(file_name, out_directory))
Exemplo n.º 2
0
def get_all_words_in_file(file_name):
    """Return every whitespace-separated token of a file, lower-cased.

    Args:
        file_name: Name of the file; the path is resolved against
            ``lem_directory`` through ``fh.get_file_name``.

    Returns:
        A list of strings in file order, with duplicates kept.
    """
    all_words_in_file = []
    # The ``with`` statement closes the file on exit, so no explicit
    # close() call is needed afterwards.
    with open(fh.get_file_name(file_name, lem_directory), 'r') as file:
        for line in file:
            # extend() grows the list in place; rebuilding it with ``+`` on
            # every line copies all previously collected words each time.
            all_words_in_file.extend(line.lower().split())
    return all_words_in_file
Exemplo n.º 3
0
def convert_index_file_to_dict():
    """Load ``index.txt`` from ``main_directory`` into a dictionary.

    For every line, the key is the text before the first ``'.'`` and the
    value is the second space-separated token of the line.

    Returns:
        dict mapping the index string to the page-name token. A later line
        with the same index overwrites the earlier entry.

    Raises:
        IndexError: if a line contains no space character.
    """
    index_path = fh.get_file_name('index.txt', main_directory)
    mapping = {}
    with open(index_path, 'r') as index_file:
        for entry in index_file:
            number = entry.partition('.')[0]
            # NOTE(review): when the page name is the last token on the line
            # it still carries the trailing newline - confirm callers expect
            # that before stripping it.
            page = entry.split(' ')[1]
            mapping[number] = page
    return mapping
Exemplo n.º 4
0
def main():
    """Write a tf / idf / tf-idf report for every file in ``lem_directory``.

    The idf table is computed once for all file names via ``calculate_idf``.
    For each file, a report with the same file name is created in
    ``res_directory``; it holds one line per distinct word in the form
    ``<word> tf=<tf> idf=<idf> tf-idf=<tf_idf>``.
    """
    file_names = os.listdir(lem_directory)
    word_idf = calculate_idf(file_names)
    for file_name in file_names:
        # ``with`` guarantees the report is flushed and closed even if one
        # of the calculations raises; the bare open() used before leaked
        # one file handle per processed file.
        with open(fh.get_file_name(file_name, res_directory), 'w') as file:
            all_words = get_all_words_in_file(file_name)
            words_tf = {}
            for word in all_words:
                words_tf[word] = calculate_tf_by_word(word, all_words)
            for key, tf in words_tf.items():
                # .get() yields None for a word missing from the idf table.
                idf = word_idf.get(key)
                tf_idf = calculate_tf_idf(tf, idf)
                file.write(key + ' tf=' + str(tf) + ' idf=' + str(idf) +
                           ' tf-idf=' + str(tf_idf) + '\n')
Exemplo n.º 5
0
def write_stat_to_file(stat):
    """Write the statistics mapping to ``stat.txt`` in ``out_directory``.

    Args:
        stat: Mapping with string keys; each entry is written as one
            ``<key> <value>`` line, the value converted with ``str``.
    """
    out_file_name = fh.get_file_name('stat.txt', out_directory)
    # ``with`` closes (and therefore flushes) the file; previously the
    # handle was left open when the function returned.
    with open(str(out_file_name), 'w') as file:
        for key, value in stat.items():
            file.write(key + ' ' + str(value) + "\n")
Exemplo n.º 6
0
def collect_stat():
    """Accumulate statistics over every file listed in ``lem_directory``.

    A single dictionary is passed to ``collect_statistic`` once per file,
    together with that file's path as built by ``fh.get_file_name``.

    Returns:
        The dictionary that was handed to ``collect_statistic`` for each file.
    """
    gathered = {}
    for entry_name in os.listdir(lem_directory):
        entry_path = fh.get_file_name(entry_name, lem_directory)
        collect_statistic(gathered, entry_path)
    return gathered