def run_script(src_folder, dst_folder, threshold, probability_dict_path=None, generate_dict=True):
    """Segment every numbered ``NNNN.dat`` file of ``src_folder`` into ``dst_folder``.

    The probability dictionary is either built from all existing source files
    and stored at ``probability_dict_path`` (when ``generate_dict`` is exactly
    ``True``) or loaded from that path. Each existing source file is then
    segmented line by line with ``SegmentHelper.phase_segment`` and the
    resulting items are written, one per line, to the same file name under
    ``dst_folder``.

    :param src_folder: folder holding the input files ``0000.dat`` .. ``8534.dat``
    :param dst_folder: folder receiving the segmented files
    :param threshold: forwarded unchanged to ``SegmentHelper.phase_segment``
    :param probability_dict_path: where the dictionary is stored or loaded;
        defaults to ``<dst_folder>/probability.dict``
    :param generate_dict: ``True`` to rebuild the dictionary, anything else to load it
    """
    if probability_dict_path is None:
        probability_dict_path = path.join(dst_folder, 'probability.dict')

    if generate_dict is True:
        corpus = []
        for index in range(8535):
            source_path = path.join(src_folder, "%04d.dat" % index)
            if not StoreHelper.is_file_exist(source_path):
                print("%s not exist!" % source_path)
                continue
            corpus.append(StoreHelper.read_file(source_path))
        probability_dict = SegmentHelper.generate_probability_dict(corpus)
        StoreHelper.store_data(probability_dict, probability_dict_path)
        print("Finished generate user dict")
    else:
        probability_dict = StoreHelper.load_data(probability_dict_path, {})
        print("Load dict from file, %i records in dict" % len(probability_dict))

    for index in range(8535):
        file_name = "%04d.dat" % index
        source_path = path.join(src_folder, file_name)
        if not StoreHelper.is_file_exist(source_path):
            # Missing inputs are skipped silently in this pass.
            continue
        target_path = path.join(dst_folder, file_name)
        segments = []
        for line in StoreHelper.read_file(source_path).splitlines():
            segments.extend(SegmentHelper.phase_segment(probability_dict, line, threshold))
        StoreHelper.save_file(os.linesep.join(segments), target_path)
def generate_token_dict(text_file_list):
    """Map each existing file's base name to its lower-cased, punctuation-free text.

    Files for which ``StoreHelper.is_file_exist`` is false are skipped. When two
    paths share the same base name, the later one overwrites the earlier entry.

    :param text_file_list: iterable of file paths to read
    :return: dict of ``{base file name: cleaned content}``
    """
    # ``str.translate(None, chars)`` only exists for Python 2 byte strings; it
    # raises TypeError on Python 3 ``str`` and on Python 2 ``unicode``.
    # Filtering characters gives the same result on every string type.
    punctuation = frozenset(string.punctuation)
    token_file_dict = {}
    for text_file in text_file_list:
        if not StoreHelper.is_file_exist(text_file):
            continue
        lowered = StoreHelper.read_file(text_file).lower()
        token_file_dict[ntpath.basename(text_file)] = "".join(
            char for char in lowered if char not in punctuation)
    return token_file_dict
def generate_phase_list():
    """Write the ``convert_2`` output of every lemmatized post to ``./data/phrase_split``.

    Loads the probability dictionary from ``./data/probability.dic``. For each
    existing ``./data/clean_post_lemmatize/NNNN.dat`` it runs
    ``PositionHelper.convert_2`` and saves the ``str()`` of every returned item,
    one per line. Missing input files are reported on stdout.
    """
    probability_dict = StoreHelper.load_data('./data/probability.dic', {})
    print("Get %i dict from file" % len(probability_dict))

    for index in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % index
        if not StoreHelper.is_file_exist(text_file):
            print("%s not exist!" % text_file)
            continue
        word_file = "./data/phrase_split/%04d.dat" % index
        converted = PositionHelper(StoreHelper.read_file(text_file)).convert_2(probability_dict)
        rendered = "\n".join(str(entry) for entry in converted)
        StoreHelper.save_file(rendered, word_file)
def convert_position():
    """Run ``PositionHelper.convert`` on every lemmatized post and persist the result.

    Loads the skills, discipline, education and responsibility dictionaries
    from ``./resource`` and prints the size of each. For every existing
    ``./data/clean_post_lemmatize/NNNN.dat`` the conversion result is written
    twice under ``./data/result_dict``: via ``StoreHelper.save_file`` as
    ``NNNN.txt`` and via ``StoreHelper.store_data`` as ``NNNN.dat``.
    """
    # (label printed in the report, resource path) in the original load order.
    resources = (
        ("skills dict", "./resource/skills.dat"),
        ("discipline_dict", "./resource/discipline.dat"),
        ("education_dict", "./resource/education.dat"),
        ("responsibility_dict", "./resource/responsibility.dat"),
    )
    loaded = []
    for label, resource_path in resources:
        resource = StoreHelper.load_data(resource_path, {})
        print("Get %i words from %s" % (len(resource), label))
        loaded.append(resource)
    skills_dict, discipline_dict, education_dict, responsibility_dict = loaded

    for index in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % index
        if not StoreHelper.is_file_exist(text_file):
            continue
        print("working on file %s" % text_file)
        word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % index, [])
        context = StoreHelper.read_file(text_file)
        result_dict = PositionHelper(context, word_list).convert(
            skills_dict, discipline_dict, education_dict, responsibility_dict,
            './resource/year_convert.dat')
        StoreHelper.save_file(result_dict, "./data/result_dict/%04d.txt" % index)
        StoreHelper.store_data(result_dict, "./data/result_dict/%04d.dat" % index)
def compute_tfidf():
    """Compute per-article TF-IDF dictionaries and a merged total.

    Every existing ``./data/clean_post_lemmatize/NNNN.dat`` is converted with
    ``PositionHelper.convert_2``; a ``TFIDF`` instance is built from all
    conversion results. Each article's scores are merged into a running total
    (unrounded), then rounded to six decimals and stored at
    ``./data/tfidf-dat/NNNN.dat``. The merged total is stored at
    ``./data/tfidf.dat``.
    """
    probability_dict = StoreHelper.load_data('./data/probability.dic', {})
    print("Get %i dict from file" % len(probability_dict))

    blob_dict = {}
    for index in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % index
        if StoreHelper.is_file_exist(text_file):
            helper = PositionHelper(StoreHelper.read_file(text_file))
            blob_dict[index] = helper.convert_2(probability_dict)

    tfidf = TFIDF(blob_dict.values())

    total_dict = {}
    for index in range(8535):
        if index not in blob_dict:
            continue
        output_file = "./data/tfidf-dat/%04d.dat" % index
        print("Working on %i article!" % index)
        scores = tfidf.get_tf_idf(blob_dict[index])
        # The total receives the unrounded scores; only the stored copy is rounded.
        DictHelper.merge_dict(total_dict, scores)
        rounded = {key: float("%.6f" % value) for key, value in scores.items()}
        StoreHelper.store_data(rounded, output_file)

    StoreHelper.store_data(total_dict, "./data/tfidf.dat")
def generate_blob_list():
    """Build one counted word/phrase dictionary per article and store the list.

    For each index whose ``./data/result_dict/NNNN.dat`` exists, the matching
    lemmatized text is split into space-separated words (a trailing ``.`` is
    dropped from each line first). For every phrase in the phrase dictionary,
    one occurrence of each of its component words is removed from the word list
    (when present) and the whole phrase is appended instead. The word list is
    then turned into a dict by ``DictHelper.dict_from_count_list``.

    Only the phrase-dict file is checked for existence; the text file is read
    unconditionally.

    :return: the list of per-article dictionaries, also stored at
        ``./data/blob_list.dat``
    """
    blob_list = []
    for index in range(8535):
        phrase_dict_file = "./data/result_dict/%04d.dat" % index
        text_file = "./data/clean_post_lemmatize/%04d.dat" % index
        if not StoreHelper.is_file_exist(phrase_dict_file):
            continue

        phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
        text_content = StoreHelper.read_file(text_file)

        tokens = []
        for raw_line in text_content.splitlines():
            stripped = raw_line[:-1] if raw_line.endswith('.') else raw_line
            tokens.extend(stripped.split(' '))

        for phrases in phrase_dict.values():
            for phrase in phrases:
                for part in phrase.split(' '):
                    if part in tokens:
                        tokens.remove(part)
                tokens.append(phrase)

        blob_list.append(DictHelper.dict_from_count_list(tokens))

    StoreHelper.store_data(blob_list, './data/blob_list.dat')
    return blob_list
def run_cluster():
    """Build a 4980 x 310 TF-IDF feature matrix and store it at ``./data/vectors.dat``.

    Reads ``./data/datascientist/NNNN.txt`` for indices 0..4979, converts each
    with ``PositionHelper.convert`` and keeps element ``[4]`` of the result as
    that article's blob. A ``TFIDF`` instance is built over all blobs. Column
    ``j`` of each row holds the article's TF-IDF value for the ``j``-th key of
    ``./resource/feature.dat``, or ``0`` when the article has no such key.

    NOTE(review): ``convert`` is called here with three dictionaries and its
    result is indexed with ``[4]``, whereas ``convert_position`` passes five
    arguments and stores the result whole — confirm against ``PositionHelper``.
    """
    article_count = 4980
    feature_count = 310

    final_vector = [[0 for _ in range(feature_count)] for _ in range(article_count)]

    # Materialise the keys: a Python 3 dict view cannot be indexed, so the
    # original ``key_set[j]`` raised TypeError there.
    key_set = list(StoreHelper.load_data("./resource/feature.dat", {}).keys())
    print("key set length: %i" % len(key_set))
    # Use at most ``feature_count`` keys. With fewer keys the remaining columns
    # stay 0 instead of raising IndexError.
    feature_keys = key_set[:feature_count]

    skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
    discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
    education_dict = StoreHelper.load_data("./resource/education.dat", {})

    blob_dict_list = []
    for i in range(article_count):
        text_file = "./data/datascientist/%04d.txt" % i
        context = StoreHelper.read_file(text_file)
        position_helper = PositionHelper(context)
        blob_dict_list.append(
            position_helper.convert(skills_dict, discipline_dict, education_dict)[4])

    tfidf = TFIDF(blob_dict_list)
    for i in range(article_count):
        print("Working on %i article!" % i)
        tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[i])
        row = final_vector[i]
        for j, key in enumerate(feature_keys):
            if key in tf_idf_dict:
                row[j] = tf_idf_dict[key]

    StoreHelper.store_data(final_vector, "./data/vectors.dat")