# Module-level imports required by run_script below (os.linesep, os.path).
# The helper classes (StoreHelper, DictHelper, SegmentHelper, HTMLHelper,
# PositionHelper, TFIDF, Main, WordFrequency) are assumed to be imported
# elsewhere in this project.
import os
from os import path


def generate_feature_vectors():
    # Step 1: collect, for each feature, the word-count dicts found
    # across all postings.
    feature_total_dict = {}
    for i in range(8535):
        result_dict_file = "./data/words_only/data/%04d.dat" % i
        if StoreHelper.is_file_exist(result_dict_file):
            result_dict = StoreHelper.load_data(result_dict_file, {})
            for feature in result_dict:
                DictHelper.append_dic_key(feature_total_dict, feature, result_dict[feature])

    # Step 2: build the vector header (the deduplicated word list) for
    # each feature.
    feature_vector_header_dict = {}
    for feature in feature_total_dict:
        feature_list = []
        for words_dict in feature_total_dict[feature]:
            feature_list.extend(words_dict.keys())
        feature_vector_header_dict[feature] = list(set(feature_list))
    StoreHelper.store_data(feature_vector_header_dict, 'feature_vector_header.dat')

    # Step 3: for every posting, collect the value of each header word,
    # defaulting to 0 when the word (or the whole feature) is absent.
    feature_vector_dict = {}
    for feature in feature_vector_header_dict:
        feature_dict = {}
        feature_list = feature_vector_header_dict[feature]
        for i in range(8535):
            result_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(result_dict_file):
                result_dict = StoreHelper.load_data(result_dict_file, {})
                words_dict = result_dict.get(feature, {})  # guard: feature may be missing in this posting
                feature_dict[i] = [words_dict.get(words, 0) for words in feature_list]
        feature_vector_dict[feature] = feature_dict
    # print(feature_vector_dict.keys())
    # print(str([len(value[1]) for value in feature_vector_dict.values()]))
    StoreHelper.store_data(feature_vector_dict, 'feature_vector.dat')
    StoreHelper.save_file(feature_vector_dict, 'feature_vector.txt')
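# A minimal usage sketch (an assumption, not part of the original
# pipeline): load the stored vectors back and report their shapes.
# 'inspect_feature_vectors' is a hypothetical helper; the load_data
# signature matches its uses in this file.
def inspect_feature_vectors():
    feature_vector_dict = StoreHelper.load_data('feature_vector.dat', {})
    header_dict = StoreHelper.load_data('feature_vector_header.dat', {})
    for feature in feature_vector_dict:
        print("feature '%s': %d postings, vector length %d"
              % (feature, len(feature_vector_dict[feature]), len(header_dict.get(feature, []))))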
def get_only_words_in_5():
    # For each posting, keep only the words that appear in its TF-IDF
    # dict, retrying with a normalized form before giving up on a word.
    for i in range(8535):
        result_dict = {}
        words_dict_file = "./data/result_dict/%04d.dat" % i
        tfidf_dict_file = "./data/tfidf-dat/%04d.dat" % i
        if StoreHelper.is_file_exist(tfidf_dict_file):
            tfidf_dict = StoreHelper.load_data(tfidf_dict_file, {})
            words_dict = StoreHelper.load_data(words_dict_file, {})
            for _type in words_dict.keys():
                result_dict[_type] = {}
                for word in words_dict[_type]:
                    if word in tfidf_dict:
                        result_dict[_type][word] = tfidf_dict[word]
                    else:
                        normal_word = SegmentHelper.normalize(word)
                        if normal_word in tfidf_dict:
                            print("Saved by normalization for %s" % normal_word)
                            result_dict[_type][word] = tfidf_dict[normal_word]
                        else:
                            print("%s not found in %s" % (word, tfidf_dict_file))
            # for _type in result_dict.keys():
            #     result_dict[_type] = DictHelper.get_sorted_list(result_dict[_type])
            # print(result_dict.keys())
            StoreHelper.store_data(result_dict, "./data/words_only/data/%04d.dat" % i)
            StoreHelper.save_file(result_dict, "./data/words_only/text/%04d.txt" % i)
def generate_company_list():
    company_name_dict = StoreHelper.load_data('company_name.dic', {})
    company_dict = {}
    # Count how many postings each company name appears in.
    for company_name in company_name_dict.values():
        DictHelper.increase_dic_key(company_dict, company_name)
    print("Found %d companies in total" % len(company_dict))
    StoreHelper.save_file(DictHelper.get_sorted_list(company_dict), "company_dict.txt")
def extract_company_name():
    crawl_dict = Main.parse_file("./resource/url_list")
    company_name_dict = {}
    total_numbers = 0
    for location in crawl_dict.keys():
        file_name = "./data/post/%s.dat" % location
        positions = StoreHelper.load_data(file_name, [])
        print("Found %i records in %s" % (len(positions), file_name))
        for url, position in positions:
            print("Working on position: %4d" % total_numbers)
            company_list = HTMLHelper.get_company_name(position)
            if len(company_list) == 0:
                print("Cannot find company name in position %d, url is %s" % (total_numbers, url))
            elif len(company_list) == 1:
                company_name_dict[total_numbers] = SegmentHelper.normalize(company_list[0])
                print("Found company name %s for position %d" % (company_list[0], total_numbers))
            else:
                # Multiple candidates: keep the first one.
                company_name_dict[total_numbers] = SegmentHelper.normalize(company_list[0])
                print("Found multiple company names %s for position %d (choosing the first one)"
                      % (str(company_list), total_numbers))
            total_numbers += 1
    StoreHelper.save_file(company_name_dict, "company_name.txt")
    StoreHelper.store_data(company_name_dict, "company_name.dic")
    print("In summary, processed %i records in total!" % total_numbers)
def run_script(src_folder, dst_folder, threshold, probability_dict_path=None, generate_dict=True):
    if probability_dict_path is None:
        probability_dict_path = path.join(dst_folder, 'probability.dict')
    if generate_dict:
        # Build the probability dict from every available source file.
        file_content_list = []
        for i in range(8535):
            input_file = path.join(src_folder, "%04d.dat" % i)
            if StoreHelper.is_file_exist(input_file):
                file_content_list.append(StoreHelper.read_file(input_file))
            else:
                print("%s does not exist!" % input_file)
        probability_dict = SegmentHelper.generate_probability_dict(file_content_list)
        StoreHelper.store_data(probability_dict, probability_dict_path)
        print("Finished generating user dict")
    else:
        probability_dict = StoreHelper.load_data(probability_dict_path, {})
        print("Loaded dict from file, %i records in dict" % len(probability_dict))

    # Segment every source file line by line and write the word list out.
    for i in range(8535):
        input_file = path.join(src_folder, "%04d.dat" % i)
        if StoreHelper.is_file_exist(input_file):
            output_file = path.join(dst_folder, "%04d.dat" % i)
            file_content = StoreHelper.read_file(input_file)
            word_list = []
            for line in file_content.splitlines():
                word_list.extend(SegmentHelper.phase_segment(probability_dict, line, threshold))
            StoreHelper.save_file(os.linesep.join(word_list), output_file)
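# Example invocation (a sketch; these folder names and the threshold
# value are assumptions for illustration, not values taken from the
# project):
# run_script("./data/clean_post_lemmatize", "./data/phrase_split", threshold=0.5)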
def generate_phase_list():
    probability_dict = StoreHelper.load_data('./data/probability.dic', {})
    print("Got %i records from dict file" % len(probability_dict))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            word_file = "./data/phrase_split/%04d.dat" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            position_dict_list = position_helper.convert_2(probability_dict)
            StoreHelper.save_file("\n".join([str(item) for item in position_dict_list]), word_file)
        else:
            print("%s does not exist!" % text_file)
def get_tfidf():
    blob_dict_list = Main.generate_blob_list()
    profile_dict_list = StoreHelper.load_data('./resource/merged_profile.dat', [])
    blob_dict_list.extend(profile_dict_list)
    tfidf = TFIDF(blob_dict_list)
    # j indexes into blob_dict_list, which must have been generated in
    # the same order as the existing text files.
    j = 0
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on article %s!" % text_file)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[j])
            StoreHelper.store_data(tf_idf_dict, "./data/tfidf-dat/%04d.dat" % i)
            StoreHelper.save_file(DictHelper.get_sorted_list(tf_idf_dict), "./data/tfidf/%04d.dat" % i)
            j += 1
def generate_all_text():
    crawl_dict = StoreHelper.parse_file("./resource/url_list")
    count_numbers = 0
    for location in crawl_dict.keys():
        file_name = "./data/post/%s.dat" % location
        positions = StoreHelper.load_data(file_name, [])
        for url, web_source in positions:
            if 'data scientist' in web_source.lower():
                text_content = HTMLHelper.get_text(web_source)
                # text_dict = WordFrequency.get_frequency_dict(text_content)
                # output = [str(item) for item in text_dict]
                # output.extend([" ", text_content, " ", url])
                StoreHelper.save_file(text_content, "./data/datascientist/%04d.txt" % count_numbers)
                count_numbers += 1
            else:
                print("'Data scientist' not found in %s!" % url)
def convert_position():
    skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
    print("Got %i words from %s" % (len(skills_dict), "skills_dict"))
    discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
    print("Got %i words from %s" % (len(discipline_dict), "discipline_dict"))
    education_dict = StoreHelper.load_data("./resource/education.dat", {})
    print("Got %i words from %s" % (len(education_dict), "education_dict"))
    responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
    print("Got %i words from %s" % (len(responsibility_dict), "responsibility_dict"))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on file %s" % text_file)
            word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
            word_data = "./data/result_dict/%04d.dat" % i
            word_text = "./data/result_dict/%04d.txt" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context, word_list)
            result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict,
                                                  responsibility_dict, './resource/year_convert.dat')
            StoreHelper.save_file(result_dict, word_text)
            StoreHelper.store_data(result_dict, word_data)
def extract_download_data():
    crawl_dict = Main.parse_file("./resource/url_list")
    total_numbers = 0
    for location in crawl_dict.keys():
        file_name = "./data/post/%s.dat" % location
        positions = StoreHelper.load_data(file_name, [])
        print("Found %i records in %s" % (len(positions), file_name))
        for url, position in positions:
            # step 1, store the original file
            # output1 = "./data/text/%04d.html" % total_numbers
            # StoreHelper.save_file(position, output1)
            output2 = "./data/clean_post_without_header/%04d.dat" % total_numbers
            print("Working on position: %4d" % total_numbers)
            status, content = HTMLHelper.get_post(position)
            if status is False:
                print("Error happened while extracting %s" % url)
                # StoreHelper.save_file(position, output2)
            else:
                StoreHelper.save_file(HTMLHelper.post_clean(content), output2)
            total_numbers += 1
    print("In summary, total downloaded %i records!" % total_numbers)
def generate_feature_list():
    vector_data = StoreHelper.load_data('vector.dat', [])
    vector_dict = {'year': vector_data[0], 'education': vector_data[1], 'major': vector_data[2],
                   'skill': vector_data[3], 'responsibility': vector_data[4]}
    StoreHelper.save_file(vector_dict, 'vector.txt')
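# A sketch of a plausible pipeline order, inferred from the files each
# step reads and writes (an assumption; the lemmatization step that
# produces ./data/clean_post_lemmatize is not part of this section):
if __name__ == '__main__':
    extract_download_data()      # raw posts  -> clean_post_without_header
    convert_position()           # lemmatized -> result_dict (per-category words)
    get_tfidf()                  # lemmatized -> tfidf-dat / tfidf
    get_only_words_in_5()        # result_dict + tfidf-dat -> words_only
    generate_feature_vectors()   # words_only -> feature_vector.dat/.txt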