Example #1
def update_probability_dict(dict_file, new_dict_file_list):
    probability_dict = StoreHelper.load_data(dict_file, {})
    # The loop variable must not shadow the dict_file parameter; otherwise
    # the merged result would be written back to the last input file.
    for new_dict_file in new_dict_file_list:
        new_dict = StoreHelper.load_data(new_dict_file, {})
        print("Got %s with %i records" % (new_dict_file, len(new_dict)))
        DictHelper.update_dict(probability_dict, new_dict)
    StoreHelper.store_data(probability_dict, dict_file)
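All of these examples lean on a project-local StoreHelper for persistence. Its implementation is not shown on this page; a minimal pickle-based sketch of the load_data/store_data/save_file trio they assume (hypothetical, the real helper may differ) could look like this:

import os
import pickle

class StoreHelper(object):
    # Hypothetical minimal stand-in for the project's StoreHelper.
    @staticmethod
    def load_data(file_name, default=None):
        # Unpickle the stored object, or return the default if the file is missing.
        if not os.path.exists(file_name):
            return default
        with open(file_name, 'rb') as fh:
            return pickle.load(fh)

    @staticmethod
    def store_data(data, file_name):
        # Pickle the object so load_data can restore it later.
        with open(file_name, 'wb') as fh:
            pickle.dump(data, fh)

    @staticmethod
    def save_file(data, file_name):
        # Human-readable companion dump of the stored structure.
        with open(file_name, 'w') as fh:
            items = data.items() if isinstance(data, dict) else data
            for item in items:
                fh.write("%s\n" % (item,))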
Example #2
def crawl_post_information(ids_file, save_file):
    id_list = StoreHelper.load_data(ids_file)
    continue_not_found = 0
    post_list = {}
    total_count = len(id_list)
    current = 0
    for post_id in id_list:
        id_url = urlparse.urljoin("https://www.linkedin.com/jobs/view/",
                                  post_id)
        print("Working on url: %s" % id_url)
        current += 1
        print("Progress report: %i of %i for %s" %
              (current, total_count, ids_file))

        web_source = CrawlHelper.get_web_source(id_url)
        company = CrawlHelper.get_company_name(web_source)
        post_content = CrawlHelper.get_post_information(web_source)

        if post_content is None:
            print("No skills found for %s! Consecutive misses: %i" %
                  (id_url, continue_not_found))
            continue_not_found += 1
            # Stop crawling after more than three consecutive misses.
            if continue_not_found > 3:
                break
        else:
            continue_not_found = 0
            # Group crawled posts by company name.
            post_list.setdefault(company, []).append(
                (company, id_url, post_content))
    StoreHelper.store_data(post_list, save_file)
    # True when the crawl covered (almost) the whole id list.
    return current >= total_count - 1
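CrawlHelper is also project-internal. A hedged sketch of what its get_web_source might do, assuming the requests library (the real helper may handle headers, retries, or throttling differently):

import requests

def get_web_source(url):
    # Fetch the page and return its HTML, or None when the request fails.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None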
Example #3
def convert_excel_to_dict(excel_file, dict_file, threshold=1):
    header, raw_data = ExcelHelper.read_excel(excel_file)
    row_number, column_number = raw_data.shape
    if column_number != 2:
        print("Attention! Excel file has more than two columns, please check! Using the first two columns as the dict")
    data_dict = {raw_data[i][0]: raw_data[i][1] for i in range(row_number)}
    # lowercase the keys and drop entries whose count is at or below the threshold
    data_dict = {key.lower(): value for key, value in data_dict.items() if value > threshold}
    StoreHelper.store_data(data_dict, dict_file)
    print("Dict generated successfully and stored to data file %s!" % dict_file)
Example #4
def extract_profile():
    _home_folder = '../resource/United States'
    profile_list = []
    for excel_file in ProfileHelper.generate_excel_list(_home_folder):
        profile_list.extend(
            ProfileHelper.generate_profile_list(excel_file))
        print("After merging file %s, total profile count is %d" %
              (excel_file, len(profile_list)))
    StoreHelper.store_data(profile_list, _home_folder + '/profile.dat')
    StoreHelper.save_file(profile_list, _home_folder + '/profile.txt')
Example #5
def generate_sentence_stream():
    sentence_stream = []
    # Post files are numbered 0000.dat .. 8534.dat; gaps in the sequence are skipped.
    for i in range(8535):
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            file_content = StoreHelper.read_file(text_file)
            for line in file_content.splitlines():
                sentence_stream.append(SegmentHelper.lemmatization(SegmentHelper.segment_text(line)))
    StoreHelper.store_data(sentence_stream, 'sentence_stream.dat')
    return sentence_stream
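SegmentHelper's tokenizer and lemmatizer are not shown either. A minimal sketch of the two calls used above, assuming NLTK's WordNet lemmatizer (hypothetical; the project may use a different pipeline):

import re
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()  # requires the NLTK wordnet corpus

def segment_text(line):
    # Split a line into lowercase word tokens.
    return re.findall(r"[a-z0-9']+", line.lower())

def lemmatization(tokens):
    # Reduce each token to its WordNet lemma.
    return [_lemmatizer.lemmatize(token) for token in tokens]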
Example #6
def generate_phrase_dict():
    sentence_stream = StoreHelper.load_data('sentence_stream.dat', [])
    # Train a bigram phrase model on the full sentence stream, then re-run
    # it over each post to merge frequent word pairs.
    phrases = Phrases(sentence_stream, min_count=2, threshold=2)
    bi_gram = Phraser(phrases)
    for i in range(8535):
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        output_file = "../data/gensim_split/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            phrase_list = GensimHelper.phrase_detection(bi_gram, text_file)
            # gensim joins detected bigrams with '_'; restore the space.
            phrase_list = [phrase.replace('_', ' ') for phrase in phrase_list]
            StoreHelper.store_data(phrase_list, output_file)
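Phrases and Phraser are the real gensim classes; GensimHelper.phrase_detection is project code. Given that the caller strips underscores afterwards, it presumably runs each tokenized line through the trained Phraser, roughly like this hypothetical sketch:

def phrase_detection(bi_gram, text_file):
    # Apply the trained Phraser line by line; gensim merges detected
    # bigrams into single '_'-joined tokens.
    phrase_list = []
    for line in StoreHelper.read_file(text_file).splitlines():
        tokens = SegmentHelper.segment_text(line)
        phrase_list.extend(bi_gram[tokens])
    return phrase_list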
Example #7
def merge_dict():
    profile_dict_list = StoreHelper.load_data(
        '../resource/convert_profile.dat', [])
    merged_list = []
    for profile_dict in profile_dict_list:
        # Flatten all feature values of one profile into a single count dict.
        merged_dict = {}
        for feature in profile_dict:
            for key in profile_dict[feature]:
                DictHelper.increase_dic_key(merged_dict, key)
        merged_list.append(merged_dict)
    StoreHelper.store_data(merged_list, '../resource/merged_profile.dat')
    StoreHelper.save_file(merged_list, '../resource/merged_profile.txt')
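DictHelper supplies the small counting utilities used throughout (update_dict in Example #1, increase_dic_key here and in Examples #8 and #9, append_dic_key and dict_from_count_list in Example #12). Plausible minimal versions, offered as a sketch rather than the project's actual code:

class DictHelper(object):
    @staticmethod
    def increase_dic_key(dic, key, count=1):
        # Increment a counter entry, creating it on first sight.
        dic[key] = dic.get(key, 0) + count

    @staticmethod
    def update_dict(target, source):
        # Merge counts from source into target.
        for key, value in source.items():
            target[key] = target.get(key, 0) + value

    @staticmethod
    def append_dic_key(dic, key, value):
        # Append value to the list stored under key.
        dic.setdefault(key, []).append(value)

    @staticmethod
    def dict_from_count_list(items):
        # Count how often each item occurs in the list.
        counts = {}
        for item in items:
            counts[item] = counts.get(item, 0) + 1
        return counts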
Example #8
def get_normalize_dict(excel_file, dict_file):
    probability_dict = {}
    header, raw_data = ExcelHelper.read_excel(excel_file)
    row_number, column_number = raw_data.shape
    print(raw_data.shape)
    if column_number != 2:
        print("Attention! Excel file has more than two columns, please check! Using the first two columns as the dict")
    for i in range(row_number):
        key = SegmentHelper.normalize(raw_data[i][0])
        if len(key.strip()) == 0:  # skip keys that normalize to an empty string
            continue
        probability_dict[key] = raw_data[i][1]
    StoreHelper.store_data(probability_dict, dict_file)
    print("Dict generated successfully; stored %i entries to data file %s!" % (len(probability_dict), dict_file))
Example #9
def get_combine_company_dict(store_data_file):
    company_dict = {}
    # Merge two Excel tabs plus a CSV into one company-count dict; the
    # normalized form is only used to filter out names that reduce to nothing.
    for tab in range(2):
        header, raw_data = ExcelHelper.read_excel('../resource/us_list_company2.xlsx', tab)
        row, column = raw_data.shape
        for i in range(row):
            company_name = SegmentHelper.normalize(str(raw_data[i][0]).strip())
            if len(company_name) > 0:
                DictHelper.increase_dic_key(company_dict, raw_data[i][0])
    df = pd.read_csv('../resource/us_list_company_1.csv')
    name_serial = df['Name']
    for i in range(df.shape[0]):
        company_name = SegmentHelper.normalize(name_serial[i])
        if len(company_name) > 0:
            DictHelper.increase_dic_key(company_dict, name_serial[i])
    StoreHelper.store_data(company_dict, store_data_file)
Example #10
def convert_profile2(debug=False):
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')
    profile_vectors = StoreHelper.load_data(
        '../resource/United States/profile.dat', [])
    university_name_convert_dict = StoreHelper.load_data(
        '../university_name_convert.dic', {})
    vector_list = []

    count = 0
    total = len(profile_vectors)
    for _profile in profile_vectors:
        count += 1
        if debug:
            print("Profile convert progress: %d/%d" % (count, total))
        educations, majors = ProfileHelper.get_highest_education(
            _profile, education_phrase_dic, discipline_phrase_dic)
        # Compute the (work_change_times, years) pair once instead of
        # calling calculate_years twice.
        work_change_times, years = ProfileHelper.calculate_years(_profile)
        profile_dict = {
            'skills': ProfileHelper.get_skills(_profile, skills_dic),
            'work_change_times': work_change_times,
            'years': years,
            'university': ProfileHelper.convert_university(
                _profile, university_name_convert_dict),
            'education': educations,
            'company': [SegmentHelper.normalize(company)
                        for company in _profile['company']],
            'major': majors
        }
        vector_list.append(profile_dict)
    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
Example #11
def convert_profile():
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')

    profile_vectors = StoreHelper.load_data(
        '../resource/United States/profile.dat', [])
    vector_list = []
    for _profile in profile_vectors:
        educations, majors = ProfileHelper.get_highest_education(
            _profile, education_phrase_dic, discipline_phrase_dic)
        profile_dict = {
            'skills': ProfileHelper.get_skills(_profile, skills_dic),
            'years': ProfileHelper.get_years(_profile),
            'education': educations,
            'major': majors
        }
        vector_list.append(profile_dict)
    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
Example #12
def print_label(label, index_list, cluster_number=None):
    if cluster_number is None:
        label_dict = DictHelper.dict_from_count_list(label)
        print("\t".join([str(i) for i in label]))
        print(label_dict)
        print("largest cluster label: %i" % max(label_dict))
        print("smallest cluster label: %i" % min(label_dict))
        # Map each cluster label to the positions assigned to it.
        position_tag = {}
        for i in range(len(label)):
            DictHelper.append_dic_key(position_tag, label[i],
                                      int(index_list[i]))
        for key, value in position_tag.items():
            print("%s: %s" % (key, value))
        StoreHelper.store_data(position_tag, 'position_tag.dat')
        StoreHelper.save_file(position_tag, 'position_tag.txt')
    else:
        length = len(label)
        clusters = [[str(j) for j in range(length) if label[j] == i]
                    for i in range(cluster_number)]
        for i in range(len(clusters)):
            print("Cluster %i has %i positions: %s" %
                  (i, len(clusters[i]), str(clusters[i])))
Example #13
def get_all_job_post(url_file, post_file):
    post_info_list = []
    # The stored url file holds a list of URLs; fall back to an empty list.
    for url in StoreHelper.load_data(url_file, []):
        web_content = CrawlHelper.get_web_source(url)
        post_info_list.append((url, web_content))
    StoreHelper.store_data(post_info_list, post_file)
Example #14
def save(self, file_name="pattern_relationship.dat"):
    # Pickle the whole object so it can be restored with StoreHelper.load_data.
    StoreHelper.store_data(self, file_name)
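Because save pickles the whole instance, the matching load is just StoreHelper.load_data. A round-trip usage sketch; the class name PatternRelationship is only assumed from the default file name:

relationship = PatternRelationship()          # hypothetical owning class
relationship.save("pattern_relationship.dat")
restored = StoreHelper.load_data("pattern_relationship.dat")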