def run_script():
    # Step 1: read the urls from the text file
    crawl_dict = Main.parse_file("./resource/url_list")
    print(crawl_dict)
    # Step 2: collect the job post urls from the web source
    for location, url_list in crawl_dict.items():
        print("working on %s to get job urls" % location)
        if StoreHelper.is_file_exist("./data/url/%s.dat" % location):
            print("File already exists, skipping this step!")
            continue
        url_set = set()
        for url in url_list:
            _list = CrawlHelper.get_all_job_url(url)
            url_set = url_set.union(set(_list))
        print("Got %i urls in total for %s\n" % (len(url_set), location))
        if len(url_set) > 0:
            StoreHelper.store_data(list(url_set), "./data/url/%s.dat" % location)
    # Step 3: download the job posts for each collected url
    for location, url_list in crawl_dict.items():
        print("working on %s to get job post information" % location)
        if StoreHelper.is_file_exist("./data/post/%s.dat" % location):
            print("File already exists, skipping this step!")
            continue
        CrawlHelper.get_all_job_post("./data/url/%s.dat" % location, "./data/post/%s.dat" % location)
def generate_feature_vectors():
    # Step 1: aggregate the per-post dicts into one list of dicts per feature
    feature_total_dict = {}
    for i in range(8535):
        result_dict_file = "./data/words_only/data/%04d.dat" % i
        if StoreHelper.is_file_exist(result_dict_file):
            result_dict = StoreHelper.load_data(result_dict_file, {})
            for feature in result_dict:
                DictHelper.append_dic_key(feature_total_dict, feature, result_dict[feature])
    # Step 2: build the vector header (the de-duplicated word list) for each feature
    feature_vector_header_dict = {}
    for feature in feature_total_dict:
        feature_list = []
        for words_dict in feature_total_dict[feature]:
            feature_list.extend(words_dict.keys())
        feature_vector_header_dict[feature] = list(set(feature_list))
    StoreHelper.store_data(feature_vector_header_dict, 'feature_vector_header.dat')
    # Step 3: collect the value for each slot of each feature vector
    feature_vector_dict = {}
    for feature in feature_vector_header_dict:
        feature_dict = {}
        feature_list = feature_vector_header_dict[feature]
        for i in range(8535):
            result_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(result_dict_file):
                result_dict = StoreHelper.load_data(result_dict_file, {})
                feature_words = result_dict.get(feature, {})  # guard: not every post has every feature
                feature_dict[i] = [feature_words.get(words, 0) for words in feature_list]
        feature_vector_dict[feature] = feature_dict
    # print(feature_vector_dict.keys())
    # print(str([len(value[1]) for value in feature_vector_dict.values()]))
    StoreHelper.store_data(feature_vector_dict, 'feature_vector.dat')
    StoreHelper.save_file(feature_vector_dict, 'feature_vector.txt')
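# Sketch: reading the persisted vectors back for a quick sanity check. Assumes
# StoreHelper.load_data returns exactly what store_data persisted; the function
# name inspect_feature_vectors is hypothetical.
def inspect_feature_vectors():
    header_dict = StoreHelper.load_data('feature_vector_header.dat', {})
    vector_dict = StoreHelper.load_data('feature_vector.dat', {})
    for feature in header_dict:
        # each post's vector for a feature has one slot per header word
        print("%s: %i dimensions, %i posts" % (feature, len(header_dict[feature]),
                                               len(vector_dict.get(feature, {}))))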
import os
from os import path


def run_script(src_folder, dst_folder, threshold, probability_dict_path=None, generate_dict=True):
    if probability_dict_path is None:
        probability_dict_path = path.join(dst_folder, 'probability.dict')
    if generate_dict:
        file_content_list = []
        for i in range(8535):
            input_file = path.join(src_folder, "%04d.dat" % i)
            if StoreHelper.is_file_exist(input_file):
                file_content_list.append(StoreHelper.read_file(input_file))
            else:
                print("%s does not exist!" % input_file)
        probability_dict = SegmentHelper.generate_probability_dict(file_content_list)
        StoreHelper.store_data(probability_dict, probability_dict_path)
        print("Finished generating user dict")
    else:
        probability_dict = StoreHelper.load_data(probability_dict_path, {})
        print("Loaded dict from file, %i records in dict" % len(probability_dict))
    for i in range(8535):
        input_file = path.join(src_folder, "%04d.dat" % i)
        if StoreHelper.is_file_exist(input_file):
            output_file = path.join(dst_folder, "%04d.dat" % i)
            file_content = StoreHelper.read_file(input_file)
            word_list = []
            for line in file_content.splitlines():
                word_list.extend(SegmentHelper.phase_segment(probability_dict, line, threshold))
            StoreHelper.save_file(os.linesep.join(word_list), output_file)
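# Example usage (sketch): the folder names and threshold below are hypothetical and
# only mirror the path patterns used elsewhere in this repo. The first run builds
# probability.dict in dst_folder; later runs can reuse it with generate_dict=False.
if __name__ == '__main__':
    run_script("./data/clean_post_lemmatize", "./data/phrase_split", threshold=0.5)
    # rerun without regenerating the dict:
    # run_script("./data/clean_post_lemmatize", "./data/phrase_split", 0.5, generate_dict=False)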
def get_only_words_in_5():
    for i in range(8535):
        result_dict = {}
        words_dict_file = "./data/result_dict/%04d.dat" % i
        tfidf_dict_file = "./data/tfidf-dat/%04d.dat" % i
        if StoreHelper.is_file_exist(tfidf_dict_file):
            tfidf_dict = StoreHelper.load_data(tfidf_dict_file, {})
            words_dict = StoreHelper.load_data(words_dict_file, {})
            for _type in words_dict.keys():
                result_dict[_type] = {}
                for word in words_dict[_type]:
                    if word in tfidf_dict:
                        result_dict[_type][word] = tfidf_dict[word]
                    else:
                        # fall back to the normalized form before giving up on the word
                        normal_word = SegmentHelper.normalize(word)
                        if normal_word in tfidf_dict:
                            print("Saved by normalization: %s" % normal_word)
                            result_dict[_type][word] = tfidf_dict[normal_word]
                        else:
                            print("%s not found in %s" % (word, tfidf_dict_file))
            # for _type in result_dict.keys():
            #     result_dict[_type] = DictHelper.get_sorted_list(result_dict[_type])
            # print(result_dict.keys())
            StoreHelper.store_data(result_dict, "./data/words_only/data/%04d.dat" % i)
            StoreHelper.save_file(result_dict, "./data/words_only/text/%04d.txt" % i)
import ntpath
import string


def generate_token_dict(text_file_list):
    token_file_dict = {}
    for text_file in text_file_list:
        file_name = ntpath.basename(text_file)
        if StoreHelper.is_file_exist(text_file):
            file_content = StoreHelper.read_file(text_file)
            lowers = file_content.lower()
            # Python 2 str.translate signature; on Python 3 use
            # lowers.translate(str.maketrans('', '', string.punctuation)) instead
            no_punctuation = lowers.translate(None, string.punctuation)
            token_file_dict[file_name] = no_punctuation
    return token_file_dict
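# Usage sketch: the sample paths are hypothetical and only mirror the naming
# pattern used elsewhere in this repo.
sample_files = ["./data/clean_post_lemmatize/%04d.dat" % i for i in range(3)]
tokens = generate_token_dict(sample_files)
for name, text in tokens.items():
    print("%s: %i characters after lowercasing and stripping punctuation" % (name, len(text)))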
def generate_phase_list():
    probability_dict = StoreHelper.load_data('./data/probability.dic', {})
    print("Got %i entries from the dict file" % len(probability_dict))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            word_file = "./data/phrase_split/%04d.dat" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            position_dict_list = position_helper.convert_2(probability_dict)
            StoreHelper.save_file("\n".join([str(item) for item in position_dict_list]), word_file)
        else:
            print("%s does not exist!" % text_file)
def get_post_vector():
    year_list = []
    education_list = []
    major_list = []
    skill_list = []
    responsibility_list = []
    position_tfidf_dict = {}
    for i in range(8535):
        phrase_dict_file = "./data/words_only/data/%04d.dat" % i
        if StoreHelper.is_file_exist(phrase_dict_file):
            phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
            position_tfidf_dict[i] = phrase_dict
            if 'working-year' in phrase_dict:
                year_list.extend(phrase_dict['working-year'].keys())
            if 'education' in phrase_dict:
                education_list.extend(phrase_dict['education'].keys())
            if 'major' in phrase_dict:
                major_list.extend(phrase_dict['major'].keys())
            if 'skills' in phrase_dict:
                skill_list.extend(phrase_dict['skills'].keys())
            if 'responsibility' in phrase_dict:
                responsibility_list.extend(phrase_dict['responsibility'].keys())
    year_list = list(set(year_list))
    print("year list count: %d" % len(year_list))
    education_list = list(set(education_list))
    print("education list count: %d" % len(education_list))
    major_list = list(set(major_list))
    print("major list count: %d" % len(major_list))
    skill_list = list(set(skill_list))
    print("skill list count: %d" % len(skill_list))
    responsibility_list = list(set(responsibility_list))
    print("responsibility list count: %d" % len(responsibility_list))
    StoreHelper.store_data([year_list, education_list, major_list, skill_list, responsibility_list], 'vector.dat')
    position_vectors = {}
    for i in range(8535):
        if i in position_tfidf_dict:
            position = []
            # .get() guards against posts that lack a feature entirely, which would
            # otherwise raise KeyError; absent words contribute 0
            for feature, word_list in [('working-year', year_list), ('education', education_list),
                                       ('major', major_list), ('skills', skill_list),
                                       ('responsibility', responsibility_list)]:
                feature_dict = position_tfidf_dict[i].get(feature, {})
                for word in word_list:
                    position.append(feature_dict.get(word, 0))
            position_vectors[i] = position
    StoreHelper.store_data(position_vectors, './data/position_vector_01.dat')
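# Sketch: mapping a vector slot back to its word. vector.dat stores the five header
# lists in the same order they were concatenated into each position vector, so a flat
# index can be walked back through the lists. slot_to_word is a hypothetical helper.
def slot_to_word(index):
    for word_list in StoreHelper.load_data('vector.dat', []):
        if index < len(word_list):
            return word_list[index]
        index -= len(word_list)
    return None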
def get_tfidf():
    blob_dict_list = Main.generate_blob_list()
    profile_dict_list = StoreHelper.load_data('./resource/merged_profile.dat', [])
    blob_dict_list.extend(profile_dict_list)
    tfidf = TFIDF(blob_dict_list)
    # j tracks the position in blob_dict_list, which only contains entries for
    # files that actually exist, so it can lag behind i
    j = 0
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on article %s!" % text_file)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[j])
            StoreHelper.store_data(tf_idf_dict, "./data/tfidf-dat/%04d.dat" % i)
            StoreHelper.save_file(DictHelper.get_sorted_list(tf_idf_dict), "./data/tfidf/%04d.dat" % i)
            j += 1
import operator


def run_script():
    # Step 1: read the urls from the text file
    crawl_dict = StoreHelper.parse_file("./resource/url_list")
    # Step 2: merge the word frequencies from every location's post file
    total_dict = {}
    for location, url_list in crawl_dict.items():
        file_name = "./data/post/%s.dat" % location
        print(file_name)
        if StoreHelper.is_file_exist(file_name):
            total_dict.update(Main.get_frequency_from_file(file_name))
    # sort by frequency, descending; note this turns the dict into a list of tuples
    total_dict = sorted(total_dict.items(), key=operator.itemgetter(1), reverse=True)
    StoreHelper.store_data(total_dict, "word_frequency.dat")
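# Sketch: because sorted() returned a list of (word, count) tuples, word_frequency.dat
# holds a list rather than a dict, so the top entries can be read back like this.
for word, count in StoreHelper.load_data("word_frequency.dat", [])[:10]:
    print("%s: %i" % (word, count))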
def convert_position():
    skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
    print("Got %i words from skills dict" % len(skills_dict))
    discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
    print("Got %i words from discipline dict" % len(discipline_dict))
    education_dict = StoreHelper.load_data("./resource/education.dat", {})
    print("Got %i words from education dict" % len(education_dict))
    responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
    print("Got %i words from responsibility dict" % len(responsibility_dict))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("working on file %s" % text_file)
            word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
            word_data = "./data/result_dict/%04d.dat" % i
            word_text = "./data/result_dict/%04d.txt" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context, word_list)
            result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict,
                                                  responsibility_dict, './resource/year_convert.dat')
            StoreHelper.save_file(result_dict, word_text)
            StoreHelper.store_data(result_dict, word_data)
def compute_tfidf():
    blob_dict = {}
    total_dict = {}
    probability_dict = StoreHelper.load_data('./data/probability.dic', {})
    print("Got %i entries from the dict file" % len(probability_dict))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            blob_dict[i] = position_helper.convert_2(probability_dict)
    tfidf = TFIDF(list(blob_dict.values()))
    for i in range(8535):
        if i in blob_dict:
            output_file = "./data/tfidf-dat/%04d.dat" % i
            print("Working on article %i!" % i)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict[i])
            DictHelper.merge_dict(total_dict, tf_idf_dict)
            # round to six decimal places before persisting
            tf_idf_dict = {key: float("%.6f" % value) for key, value in tf_idf_dict.items()}
            StoreHelper.store_data(tf_idf_dict, output_file)
    StoreHelper.store_data(total_dict, "./data/tfidf.dat")
def generate_blob_list():
    blob_list = []
    for i in range(8535):
        phrase_dict_file = "./data/result_dict/%04d.dat" % i
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(phrase_dict_file):
            phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
            text_content = StoreHelper.read_file(text_file)
            word_list = []
            for line in text_content.splitlines():
                if line.endswith('.'):
                    line = line[:-1]
                for word in line.split(' '):
                    word_list.append(word)
            # replace the constituent words of each known phrase with the phrase itself
            for _type in phrase_dict.keys():
                for words in phrase_dict[_type]:
                    for word in words.split(' '):
                        if word in word_list:
                            word_list.remove(word)
                    word_list.append(words)
            blob_list.append(DictHelper.dict_from_count_list(word_list))
    StoreHelper.store_data(blob_list, './data/blob_list.dat')
    return blob_list
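# Usage sketch: each blob is a {word_or_phrase: count} dict, which is the shape
# TFIDF consumes elsewhere in this repo (see get_tfidf and compute_tfidf above).
blobs = generate_blob_list()
tfidf = TFIDF(blobs)
first_post_scores = tfidf.get_tf_idf(blobs[0])
print("%i scored terms in the first post" % len(first_post_scores))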