# NOTE: StoreHelper, DictHelper, CrawlHelper, ExcelHelper, SegmentHelper,
# ProfileHelper and GensimHelper are project-local helpers; their import paths
# depend on the package layout and are omitted here.
import urlparse  # Python 2 stdlib; on Python 3 use urllib.parse instead

import pandas as pd
from gensim.models.phrases import Phrases, Phraser


def update_probability_dict(dict_file, new_dict_file_list):
    probability_dict = StoreHelper.load_data(dict_file, {})
    # Do not reuse the name dict_file for the loop variable: shadowing it made
    # the final store_data() write to the last batch file instead of dict_file.
    for new_file in new_dict_file_list:
        new_dict = StoreHelper.load_data(new_file, {})
        print("Get %s with records: %i" % (new_file, len(new_dict)))
        DictHelper.update_dict(probability_dict, new_dict)
    StoreHelper.store_data(probability_dict, dict_file)
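# Hedged usage sketch: the file names below are hypothetical placeholders for a
# running count dictionary and two freshly crawled batches.
_batches = ['batch_01.dat', 'batch_02.dat']
if all(StoreHelper.is_file_exist(f) for f in _batches):
    update_probability_dict('probability.dat', _batches)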
def crawl_post_information(ids_file, save_file):
    id_list = StoreHelper.load_data(ids_file)
    continue_not_found = 0
    post_list = {}
    total_count = len(id_list)
    current = 0
    for ids in id_list:
        id_url = urlparse.urljoin("https://www.linkedin.com/jobs/view/", ids)
        print("Working on url: %s" % id_url)
        current += 1
        print("progress report: %i in %i for %s" % (current, total_count, ids_file))
        web_source = CrawlHelper.get_web_source(id_url)
        company = CrawlHelper.get_company_name(web_source)
        post_content = CrawlHelper.get_post_information(web_source)
        if post_content is None:
            print("No skills found for %s! Continue times %i" % (id_url, continue_not_found))
            continue_not_found += 1
            if continue_not_found > 3:  # give up after four consecutive misses
                break
            continue  # do not record an empty post
        continue_not_found = 0
        if company in post_list:
            post_list[company].append((company, id_url, post_content))
        else:
            post_list[company] = [(company, id_url, post_content)]
    StoreHelper.store_data(post_list, save_file)
    return current >= total_count - 1  # True when (almost) every id was visited
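# Hedged usage sketch: the return value reports whether (almost) the whole id
# list was visited, so a caller can retry when the crawl bailed out early.
# 'ids.dat' and 'posts.dat' are hypothetical file names.
if StoreHelper.is_file_exist('ids.dat'):
    while not crawl_post_information('ids.dat', 'posts.dat'):
        print("Crawl stopped early after repeated misses, retrying...")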
def convert_excel_to_dict(excel_file, dict_file, threshold=1):
    header, raw_data = ExcelHelper.read_excel(excel_file)
    row_number, column_number = raw_data.shape
    if column_number != 2:
        print("Attention! Excel file has more than two columns, please have a check! "
              "Using the first two columns as the dict")
    data_dict = {raw_data[i][0]: raw_data[i][1] for i in range(row_number)}
    # lower-case the keys and keep only entries whose count exceeds the threshold
    data_dict = {key.lower(): value for key, value in data_dict.items() if value > threshold}
    StoreHelper.store_data(data_dict, dict_file)
    print("Generated dict successfully and stored it to data file %s!" % dict_file)
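# Self-contained sketch of the filtering step above: lower-case the keys and
# keep only entries whose count exceeds the threshold (sample data is made up).
_sample = {'Python': 5, 'SQL': 1, 'Hadoop': 2}
_threshold = 1
print({k.lower(): v for k, v in _sample.items() if v > _threshold})
# -> {'python': 5, 'hadoop': 2}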
def extract_profile():
    _home_folder = '../resource/United States'
    profile_list = []
    for excel_file in ProfileHelper.generate_excel_list(_home_folder):
        profile_list.extend(ProfileHelper.generate_profile_list(excel_file))
        print("After merging file (%s), total profile count is %d" % (excel_file, len(profile_list)))
    StoreHelper.store_data(profile_list, _home_folder + '/profile.dat')
    StoreHelper.save_file(profile_list, _home_folder + '/profile.txt')
def generate_sentence_stream():
    sentence_stream = []
    for i in range(8535):  # 8535 cleaned post files in total
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            file_content = StoreHelper.read_file(text_file)
            for line in file_content.splitlines():
                sentence_stream.append(SegmentHelper.lemmatization(SegmentHelper.segment_text(line)))
    StoreHelper.store_data(sentence_stream, 'sentence_stream.dat')
    return sentence_stream
def generate_phrase_dict():
    sentence_stream = StoreHelper.load_data('sentence_stream.dat', [])
    phrases = Phrases(sentence_stream, min_count=2, threshold=2)
    bi_gram = Phraser(phrases)
    for i in range(8535):
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        output_file = "../data/gensim_split/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            phrase_list = GensimHelper.phrase_detection(bi_gram, text_file)
            phrase_list = [phrase.replace('_', ' ') for phrase in phrase_list]
            StoreHelper.store_data(phrase_list, output_file)
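# Self-contained sketch of the Phrases -> Phraser pipeline used above, on a toy
# corpus (real gensim API; min_count/threshold here are illustrative only).
_toy_stream = [['machine', 'learning', 'engineer'],
               ['machine', 'learning', 'scientist'],
               ['data', 'scientist']]
_toy_bigram = Phraser(Phrases(_toy_stream, min_count=1, threshold=1))
# 'machine learning' co-occurs often enough to be fused into one token:
print(_toy_bigram[['machine', 'learning', 'engineer']])  # e.g. ['machine_learning', 'engineer']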
def merge_dict():
    profile_dict_list = StoreHelper.load_data('../resource/convert_profile.dat', [])
    merged_list = []
    for profile_dict in profile_dict_list:
        merged_dict = {}
        for feature in profile_dict:
            for key in profile_dict[feature]:
                DictHelper.increase_dic_key(merged_dict, key)
        merged_list.append(merged_dict)
    StoreHelper.store_data(merged_list, '../resource/merged_profile.dat')
    StoreHelper.save_file(merged_list, '../resource/merged_profile.txt')
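# Self-contained sketch of the merge step, assuming DictHelper.increase_dic_key
# simply increments a per-key counter (inferred from its use above).
_profile = {'skills': ['python', 'sql'], 'major': ['statistics', 'python']}
_merged = {}
for _feature in _profile:
    for _key in _profile[_feature]:
        _merged[_key] = _merged.get(_key, 0) + 1
print(_merged)  # {'python': 2, 'sql': 1, 'statistics': 1}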
def get_normalize_dict(excel_file, dict_file):
    probability_dict = {}
    header, raw_data = ExcelHelper.read_excel(excel_file)
    row_number, column_number = raw_data.shape
    print(raw_data.shape)
    if column_number != 2:
        print("Attention! Excel file has more than two columns, please have a check! "
              "Using the first two columns as the dict")
    for i in range(row_number):
        key = SegmentHelper.normalize(raw_data[i][0])
        if len(key.strip()) == 0:  # skip keys that normalize to an empty string
            continue
        probability_dict[key] = raw_data[i][1]
    StoreHelper.store_data(probability_dict, dict_file)
    print("Generated dict (%i entries) successfully and stored it to data file %s!"
          % (len(probability_dict), dict_file))
def get_combine_company_dict(store_data_file):
    company_dict = {}
    # First source: two tabs of an Excel sheet of US listed companies.
    for tab in range(2):
        header, raw_data = ExcelHelper.read_excel('../resource/us_list_company2.xlsx', tab)
        row, column = raw_data.shape
        for i in range(row):
            company_name = SegmentHelper.normalize(str(raw_data[i][0]).strip())
            if len(company_name) > 0:  # skip names that normalize to nothing, keep the original spelling
                DictHelper.increase_dic_key(company_dict, raw_data[i][0])
    # Second source: the 'Name' column of a CSV of US listed companies.
    df = pd.read_csv('../resource/us_list_company_1.csv')
    name_serial = df['Name']
    for i in range(df.shape[0]):
        company_name = SegmentHelper.normalize(name_serial[i])
        if len(company_name) > 0:
            DictHelper.increase_dic_key(company_dict, name_serial[i])
    StoreHelper.store_data(company_dict, store_data_file)
def convert_profile2(debug=False):
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')
    profile_vectors = StoreHelper.load_data('../resource/United States/profile.dat', [])
    university_name_convert_dict = StoreHelper.load_data('../university_name_convert.dic', {})
    vector_list = []
    count = 0
    total = len(profile_vectors)
    for _profile in profile_vectors:
        count += 1
        if debug:
            print("Profile convert progress: %d/%d" % (count, total))
        educations, majors = ProfileHelper.get_highest_education(_profile, education_phrase_dic,
                                                                 discipline_phrase_dic)
        work_change_times, years = ProfileHelper.calculate_years(_profile)  # compute once, not twice
        profile_dict = {
            'skills': ProfileHelper.get_skills(_profile, skills_dic),
            'work_change_times': work_change_times,
            'years': years,
            'university': ProfileHelper.convert_university(_profile, university_name_convert_dict),
            'education': educations,
            'company': [SegmentHelper.normalize(company) for company in _profile['company']],
            'major': majors,
        }
        vector_list.append(profile_dict)
    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
def convert_profile():
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')
    profile_vectors = StoreHelper.load_data('../resource/United States/profile.dat', [])
    vector_list = []
    for _profile in profile_vectors:
        educations, majors = ProfileHelper.get_highest_education(_profile, education_phrase_dic,
                                                                 discipline_phrase_dic)
        profile_dict = {
            'skills': ProfileHelper.get_skills(_profile, skills_dic),
            'years': ProfileHelper.get_years(_profile),
            'education': educations,
            'major': majors,
        }
        vector_list.append(profile_dict)
    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
def print_label(label, index_list, cluster_number=None):
    if cluster_number is None:
        label_dict = DictHelper.dict_from_count_list(label)
        print("\t".join([str(i) for i in label]))
        print(label_dict)
        print("max cluster number: %i" % max(label_dict))
        print("min cluster number: %i" % min(label_dict))
        position_tag = {}
        for i in range(len(label)):
            DictHelper.append_dic_key(position_tag, label[i], int(index_list[i]))
        for key, value in position_tag.items():
            print("%s: %s" % (key, value))
        StoreHelper.store_data(position_tag, 'position_tag.dat')
        StoreHelper.save_file(position_tag, 'position_tag.txt')
    else:
        length = len(label)
        clusters = [[str(j) for j in range(length) if label[j] == i] for i in range(cluster_number)]
        for i in range(len(clusters)):
            print("Cluster %i has %i positions, positions: %s" % (i, len(clusters[i]), str(clusters[i])))
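# Self-contained sketch of the cluster-grouping comprehension above: collect,
# per cluster, the indices of the samples assigned to it.
_label = [0, 1, 0, 2, 1]
_clusters = [[str(j) for j in range(len(_label)) if _label[j] == i] for i in range(3)]
print(_clusters)  # [['0', '2'], ['1', '4'], ['3']]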
def get_all_job_post(url_file, post_file):
    post_info_list = []
    for url in StoreHelper.load_data(url_file, {}):
        web_content = CrawlHelper.get_web_source(url)
        post_info_list.append((url, web_content))
    StoreHelper.store_data(post_info_list, post_file)
def save(self, file_name="pattern_relationship.dat"):
    StoreHelper.store_data(self, file_name)