def update_probability_dict(dict_file, new_dict_file_list):
    # Merge every new dictionary into the existing probability dictionary, then save.
    probability_dict = StoreHelper.load_data(dict_file, {})
    for new_dict_file in new_dict_file_list:  # distinct name so the final save still targets dict_file
        new_dict = StoreHelper.load_data(new_dict_file, {})
        print("Get %s with records: %i" % (new_dict_file, len(new_dict)))
        DictHelper.update_dict(probability_dict, new_dict)
    StoreHelper.store_data(probability_dict, dict_file)
def crawl_post_information(ids_file, save_file):
    id_list = StoreHelper.load_data(ids_file)
    continue_not_found = 0
    post_list = {}
    total_count = len(id_list)
    current = 0
    for ids in id_list:
        id_url = urlparse.urljoin("https://www.linkedin.com/jobs/view/", ids)
        print("Working on url: %s" % id_url)
        current += 1
        print("progress report: %i in %i for %s" % (current, total_count, ids_file))
        web_source = CrawlHelper.get_web_source(id_url)
        company = CrawlHelper.get_company_name(web_source)
        post_content = CrawlHelper.get_post_information(web_source)
        if post_content is None:
            print("No skills found for %s! Continue times %i" % (id_url, continue_not_found))
            continue_not_found += 1
            if continue_not_found > 3:  # give up after several consecutive empty posts
                break
            continue  # do not record a post without content
        else:
            continue_not_found = 0
        if company in post_list:
            post_list[company].append((company, id_url, post_content))
        else:
            post_list[company] = [(company, id_url, post_content)]
    StoreHelper.store_data(post_list, save_file)
    return current >= total_count - 1
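# Standalone sanity check (not part of the crawler): urljoin only appends the post id
# because the base URL ends with a slash; without it the last path segment is replaced.
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin      # Python 2

assert urljoin("https://www.linkedin.com/jobs/view/", "12345") == "https://www.linkedin.com/jobs/view/12345"
assert urljoin("https://www.linkedin.com/jobs/view", "12345") == "https://www.linkedin.com/jobs/12345"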
def convert_profile2(debug=False):
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')
    profile_vectors = StoreHelper.load_data('../resource/United States/profile.dat', [])
    university_name_convert_dict = StoreHelper.load_data('../university_name_convert.dic', {})
    vector_list = []
    count = 0
    total = len(profile_vectors)
    for _profile in profile_vectors:
        count += 1
        if debug:
            print("Profile convert progress: %d/%d" % (count, total))
        educations, majors = ProfileHelper.get_highest_education(_profile, education_phrase_dic, discipline_phrase_dic)
        work_change_times, years = ProfileHelper.calculate_years(_profile)  # compute once instead of twice
        profile_dict = {
            'skills': ProfileHelper.get_skills(_profile, skills_dic),
            'work_change_times': work_change_times,
            'years': years,
            'university': ProfileHelper.convert_university(_profile, university_name_convert_dict),
            'education': educations,
            'company': [SegmentHelper.normalize(company) for company in _profile['company']],
            'major': majors
        }
        vector_list.append(profile_dict)
    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
def split_dict():
    # Split the phrase dictionary into single words vs multi-word phrases (joined by '_').
    phase_dict = StoreHelper.load_data("phase_dict.dat", {})
    phase_dict_single = {}
    phase_dict_double = {}
    for key, value in phase_dict.items():
        if '_' in key:
            phase_dict_double[key] = value
        else:
            phase_dict_single[key] = value
    StoreHelper.save_file(DictHelper.get_sorted_list(phase_dict_single), 'phase_dict_single.txt')
    StoreHelper.save_file(DictHelper.get_sorted_list(phase_dict_double), 'phase_dict_double.txt')
from gensim.models.phrases import Phrases, Phraser


def generate_phrase_dict():
    sentence_stream = StoreHelper.load_data('sentence_stream.dat', [])
    phrases = Phrases(sentence_stream, min_count=2, threshold=2)
    bi_gram = Phraser(phrases)
    for i in range(8535):  # upper bound on the numbered post files
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        output_file = "../data/gensim_split/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            phrase_list = GensimHelper.phrase_detection(bi_gram, text_file)
            phrase_list = [phrase.replace('_', ' ') for phrase in phrase_list]
            StoreHelper.store_data(phrase_list, output_file)
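# Standalone sketch of the gensim bigram detection used above, on a toy corpus
# (the real input comes from sentence_stream.dat). min_count/threshold are lowered
# so the tiny example actually promotes a bigram.
from gensim.models.phrases import Phrases, Phraser

toy_stream = [["big", "data", "platform"], ["big", "data", "skills"], ["python", "developer"]]
toy_bigram = Phraser(Phrases(toy_stream, min_count=1, threshold=1))
print(toy_bigram[["big", "data", "skills"]])  # e.g. ['big_data', 'skills']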
def merge_dict():
    # Flatten each profile's per-feature values into a single count dictionary.
    profile_dict_list = StoreHelper.load_data('../resource/convert_profile.dat', [])
    merged_list = []
    for profile_dict in profile_dict_list:
        merged_dict = {}
        for feature in profile_dict:
            for key in profile_dict[feature]:
                DictHelper.increase_dic_key(merged_dict, key)
        merged_list.append(merged_dict)
    StoreHelper.store_data(merged_list, '../resource/merged_profile.dat')
    StoreHelper.save_file(merged_list, '../resource/merged_profile.txt')
import operator  # for itemgetter below


def calculate_full_frequency():
    html_list = StoreHelper.load_data("../data/post/Delaware.dat", [])
    words_frequency_list = []
    for _url, _web_source in html_list:
        clean_content = HTMLHelper.get_text(_web_source)
        text_dict = WordFrequency.get_frequency_dict(clean_content)
        # sorted() yields (word, count) pairs in descending count order
        sorted_pairs = sorted(text_dict.items(), key=operator.itemgetter(1), reverse=True)
        words_frequency_list.append(sorted_pairs)
    for sorted_pairs in words_frequency_list:
        print(sorted_pairs)
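# For comparison, the same descending-frequency list with only the standard library
# (WordFrequency.get_frequency_dict is the project's own helper).
from collections import Counter

sample_text = "data science and data engineering"
print(Counter(sample_text.split()).most_common())
# [('data', 2), ('science', 1), ('and', 1), ('engineering', 1)]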
def convert_profile():
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')
    profile_vectors = StoreHelper.load_data('../resource/United States/profile.dat', [])
    vector_list = []
    for _profile in profile_vectors:
        educations, majors = ProfileHelper.get_highest_education(_profile, education_phrase_dic, discipline_phrase_dic)
        profile_dict = {
            'skills': ProfileHelper.get_skills(_profile, skills_dic),
            'years': ProfileHelper.get_years(_profile),
            'education': educations,
            'major': majors
        }
        vector_list.append(profile_dict)
    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
def _get_working_year_words(self, year_convert_file=None):
    year_list = TextHelper.get_years_pattern(self.raw_position)
    if len(year_list) == 0:
        default_year_requirement = "[0]"  # placeholder when the post states no year requirement
        self.new_words_list.append(default_year_requirement)
        year_list = [default_year_requirement]
    elif year_convert_file is not None:
        year_convert_dict = StoreHelper.load_data(year_convert_file, {})
        # years without an entry in the convert dict are silently dropped
        year_list = [year_convert_dict[item] for item in year_list if item in year_convert_dict]
    return DictHelper.dict_from_count_list(year_list)
def get_all_job_post(url_file, post_file):
    # Download each stored URL and persist (url, html) pairs.
    post_info_list = []
    for url in StoreHelper.load_data(url_file, {}):
        web_content = CrawlHelper.get_web_source(url)
        post_info_list.append((url, web_content))
    StoreHelper.store_data(post_info_list, post_file)
@staticmethod
def run_script(vector_list):
    ClusterHelper.plot_clusters(np.array(vector_list), hdbscan.HDBSCAN, (), {'min_cluster_size': 15})

@staticmethod
def mean_shift_cluster(vector_list):
    np_array = np.array(vector_list)
    bandwidth = estimate_bandwidth(np_array, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(np_array)
    ClusterHelper.print_label(ms.labels_)

@staticmethod
def birch_cluster(vector_list, index_list):
    np_array = np.array(vector_list, dtype=float)
    brc = Birch(branching_factor=50, threshold=0.05, compute_labels=True)
    brc.fit(np_array)
    label = brc.predict(np_array)
    ClusterHelper.print_label(label, index_list)


if __name__ == '__main__':
    # _vector_list = StoreHelper.load_data("../data/vectors.dat")
    # ClusterHelper.mean_shift_cluster(_vector_list)
    # ClusterHelper.birch_cluster(_vector_list)
    # ClusterHelper.run_script(_vector_list)
    position_dict = StoreHelper.load_data("../data/position_vector_01.dat", {})
    _vector_list = list(position_dict.values())  # list() so Python 3 dict views convert cleanly
    _index_list = list(position_dict.keys())
    ClusterHelper.birch_cluster(_vector_list, _index_list)
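# Toy run of sklearn's Birch mirroring birch_cluster above; the points are made-up
# 2-D vectors, whereas the real input is the position vector dict loaded in __main__.
import numpy as np
from sklearn.cluster import Birch

toy_points = np.array([[0.0, 0.0], [0.1, 0.1], [5.0, 5.0], [5.1, 4.9]])
toy_brc = Birch(branching_factor=50, threshold=0.5, n_clusters=2)
print(toy_brc.fit_predict(toy_points))  # e.g. [0 0 1 1] -> two clusters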
fig = plt.figure(1, figsize=(8, 6))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
plt.cla()
# for label in range(cluster_number):
#     name = "cluster %i" % label
#     ax.text3D(X[y == label, 33].mean(),
#               X[y == label, 99].mean(),
#               X[y == label, 112].mean(), '',
#               horizontalalignment='center',
#               bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# y = np.choose(y, [0, 1, 2]).astype(np.float)
ax.scatter(X[:, 15], X[:, 17], X[:, 23], c=y)
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
# axis labels are leftovers from the sklearn iris demo; they do not describe columns 15/17/23 of X
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
plt.show()


if __name__ == '__main__':
    _vector_list = StoreHelper.load_data("../data/vectors.dat")
    PlotHelper.plot_k_means(_vector_list)
    return self.phrase_dict

def convert_2(self, probability_dict):
    year_phase_list = self._get_working_year_words()
    phrase_list = self._remove_conjunction_segment(probability_dict)
    phrase_list.extend(year_phase_list)
    return DictHelper.dict_from_count_list(phrase_list)

def _remove_conjunction_segment(self, probability_dict):
    phase_list = []
    sentence_list = []
    word_list = SegmentHelper.segment_text(self.raw_position)
    word_group = []
    stop_words = set(stopwords.words('english'))  # hoisted: avoid re-reading the corpus per word
    for word in word_list:
        if word in stop_words:
            if len(word_group) > 0:
                sentence_list.append(' '.join(word_group))
            word_group = []
        else:
            word_group.append(word)
    if len(word_group) > 0:
        sentence_list.append(' '.join(word_group))
    for sentence in sentence_list:
        phase_list.extend(SegmentHelper.phase_segment(probability_dict, sentence, 0.05))
    return phase_list


if __name__ == '__main__':
    year_convert = StoreHelper.load_data('../resource/year_convert.dat', {})
    print(year_convert['four year'])
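# Standalone sketch of the stopword-splitting idea in _remove_conjunction_segment:
# stopwords act as segment boundaries, and everything between them stays grouped.
# Assumes the NLTK stopword corpus is available (nltk.download('stopwords')).
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
groups, current = [], []
for word in "experience with python and machine learning".split():
    if word in stop_words:
        if current:
            groups.append(' '.join(current))
        current = []
    else:
        current.append(word)
if current:
    groups.append(' '.join(current))
print(groups)  # ['experience', 'python', 'machine learning']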
def build_from_file(file_name="pattern_relationship.dat"):
    return StoreHelper.load_data(file_name)
    # Try the known job-post container ids in order; report whether one was found.
    post = soup.find('div', id='jobcopy')
    if post is not None:
        return True, post
    post = soup.find('div', id='bodycol')
    if post is not None:
        return True, post
    post = soup.find('div', id='JobDescription')
    return (True, post) if post is not None else (False, None)

@staticmethod
def post_clean(soup_element):
    # Strip style blocks, abstract snippets and links, then keep non-empty text lines.
    styles = soup_element.find('style')
    if styles is not None:
        styles.decompose()
    shorts = soup_element.find('div', {'ng-if': 'featuredJobModel.showAbstract'})
    if shorts is not None:
        shorts.decompose()
    a_link = soup_element.find('a')
    if a_link is not None:
        a_link.decompose()
    return os.linesep.join([s for s in soup_element.text.splitlines() if len(s.strip()) > 0])


if __name__ == '__main__':
    _html_list = StoreHelper.load_data("../data/post/Delaware.dat", [])
    _web_source = _html_list[4][1]
    print(_html_list[4][0])
    # print(_web_source)
    print(HTMLHelper.get_text(_web_source))
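# Minimal BeautifulSoup sketch of the post_clean pattern above, on made-up HTML.
import os
from bs4 import BeautifulSoup

demo_html = "<div id='jobcopy'><style>.x{color:red}</style><p>Build ML pipelines</p><a href='#'>Apply</a></div>"
demo_post = BeautifulSoup(demo_html, 'html.parser').find('div', id='jobcopy')
for tag in (demo_post.find('style'), demo_post.find('a')):
    if tag is not None:
        tag.decompose()  # drop styling and link noise before text extraction
print(os.linesep.join(s for s in demo_post.text.splitlines() if s.strip()))
# Build ML pipelines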