Example #1
    def generate_feature_vectors():
        # step 1, generate total dict for each feature
        feature_total_dict = {}
        for i in range(8535):
            result_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(result_dict_file):
                result_dict = StoreHelper.load_data(result_dict_file, {})
                for feature in result_dict:
                    DictHelper.append_dic_key(feature_total_dict, feature, result_dict[feature])

        # step 2, generate feature vector for each feature
        feature_vector_header_dict = {}
        for feature in feature_total_dict:
            feature_list = []
            for words_dict in feature_total_dict[feature]:
                feature_list.extend(words_dict.keys())
            feature_list = list(set(feature_list))
            feature_vector_header_dict[feature] = feature_list
        StoreHelper.store_data(feature_vector_header_dict, 'feature_vector_header.dat')

        # step 3, collect value for each feature vector
        feature_vector_dict = {}
        for feature in feature_vector_header_dict:
            feature_dict = {}
            feature_list = feature_vector_header_dict[feature]
            for i in range(8535):
                result_dict_file = "./data/words_only/data/%04d.dat" % i
                if StoreHelper.is_file_exist(result_dict_file):
                    result_dict = StoreHelper.load_data(result_dict_file, {})
                    # 0 for words absent from this posting (or when the
                    # posting lacks this feature entirely)
                    feature_dict[i] = [result_dict.get(feature, {}).get(words, 0)
                                       for words in feature_list]
            feature_vector_dict[feature] = feature_dict
        # print (feature_vector_dict.keys())
        # print (str([len(value[1]) for value in feature_vector_dict.values()]))
        StoreHelper.store_data(feature_vector_dict, 'feature_vector.dat')
        StoreHelper.save_file(feature_vector_dict, 'feature_vector.txt')
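For reference, a minimal sketch of the accumulation helper this example depends on, inferred from its usage here (the project's actual DictHelper.append_dic_key may differ):

    def append_dic_key(dic, key, value):
        # collect values under a key, creating the list on first use
        dic.setdefault(key, []).append(value)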
Example #2
 def get_only_words_in_5():
     for i in range(8535):
         result_dict = {}
         words_dict_file = "./data/result_dict/%04d.dat" % i
         tfidf_dict_file = "./data/tfidf-dat/%04d.dat" % i
         if StoreHelper.is_file_exist(tfidf_dict_file):
             tfidf_dict = StoreHelper.load_data(tfidf_dict_file, {})
             words_dict = StoreHelper.load_data(words_dict_file, {})
             for _type in words_dict.keys():
                 result_dict[_type] = {}
                 for word in words_dict[_type]:
                     if word in tfidf_dict:
                         result_dict[_type][word] = tfidf_dict[word]
                     else:
                         normal_word = SegmentHelper.normalize(word)
                         if normal_word in tfidf_dict:
                             print ("Saved by normalize for %s" % normal_word)
                             result_dict[_type][word] = tfidf_dict[normal_word]
                         else:
                             print ("%s not found in %s" % (word, tfidf_dict_file))
             # for _type in result_dict.keys():
             #     result_dict[_type] = DictHelper.get_sorted_list(result_dict[_type])
             # print (result_dict.keys())
             StoreHelper.store_data(result_dict, "./data/words_only/data/%04d.dat" % i)
             StoreHelper.save_file(result_dict, "./data/words_only/text/%04d.txt" % i)
Example #3
    def get_company_rank():
        company_rank_dict = {}
        us_list_company_data_file = './resource/company_list.dat'
        fortune_500_company_data_file = './resource/fortune-500.dat'
        posting_company_data_file = 'company_name.dic'

        posting_company_dict = StoreHelper.load_data(posting_company_data_file,
                                                     {})
        us_list_company_dict = StoreHelper.load_data(us_list_company_data_file,
                                                     {})
        fortune_500_company_dict = StoreHelper.load_data(
            fortune_500_company_data_file, {})

        for company_name in posting_company_dict.values():
            rank = 3  # default: ordinary company
            for company in fortune_500_company_dict:
                if TextHelper.word_in_phrase(company_name, company):
                    rank = 1  # Fortune 500 company
                    break
            if rank == 3:
                for company in us_list_company_dict:
                    if TextHelper.word_in_phrase(company_name, company):
                        rank = 2  # publicly listed US company
                        break
            company_rank_dict[company_name] = rank
        StoreHelper.store_data(company_rank_dict, 'company_rank.dic')
        print(
            DictHelper.get_sorted_list(company_rank_dict,
                                       sorted_by_key=False,
                                       reverse=False))
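TextHelper.word_in_phrase is not listed here; one plausible reading, assuming a whole-token containment check (the real helper may tokenize or match differently):

    def word_in_phrase(word, phrase):
        # True when every token of `word` appears among the tokens of `phrase`
        phrase_tokens = phrase.lower().split()
        return all(token in phrase_tokens for token in word.lower().split())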
Example #4
    def compute_center_point(exclude_post=None, select_feature=None):
        # avoid a mutable default argument; these indices are known outliers
        if exclude_post is None:
            exclude_post = [1404, 3721, 4337, 2085, 7246]
        position_vectors = StoreHelper.load_data('./data/position_vector_01.dat', {})
        for index in exclude_post:
            if index in position_vectors:
                del position_vectors[index]
        vector_list = StoreHelper.load_data('vector.dat', [])

        vector_dict = {'working-year': vector_list[0], 'education': vector_list[1], 'major': vector_list[2],
                       'skills': vector_list[3], 'responsibility': vector_list[4]}
        vector_length = [len(item_list) for item_list in vector_list]
        vector_length_dict = {'working-year': (0, sum(vector_length[:1])),
                              'education': (sum(vector_length[:1]), sum(vector_length[:2])),
                              'major': (sum(vector_length[:2]), sum(vector_length[:3])),
                              'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
                              'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))}

        csv_index = list(position_vectors.keys())  # materialize for indexing below

        if select_feature is None:
            csv_column = []
            for item_list in vector_list:
                csv_column.extend(item_list)
            csv_data = list(position_vectors.values())
            csv_file = 'center_point.csv'
        else:
            start, end = vector_length_dict[select_feature]
            csv_column = vector_dict[select_feature]
            csv_data = [position[start: end] for position in position_vectors.values()]
            csv_file = '%s_center_point.csv' % select_feature
        center_point = [0 for i in range(len(csv_column))]
        for position in csv_data:
            for i in range(len(center_point)):
                center_point[i] += position[i]
        center_point = [value / float(len(position_vectors)) for value in center_point]
        print ("Center point: %s" % str(center_point))
        StoreHelper.store_data(center_point, 'center_point.dat')
        center_dict = {csv_column[i]: center_point[i] for i in range(len(csv_column))}
        print (center_dict)
        center_list = DictHelper.get_sorted_list(center_dict, sorted_by_key=False)
        print (center_list)
        Main.write_list_to_csv(csv_file, [pair[0] for pair in center_list], [[pair[1] for pair in center_list]])

        max_distance = (0, 0)
        for i in range(len(csv_data)):
            distance = Main.compute_distance(center_point, csv_data[i])
            if distance > max_distance[1]:
                max_distance = (csv_index[i], distance)
        print("max distance: %s" % str(max_distance))
Example #5
 def get_frequency_from_file(file_name):
     _html_list = StoreHelper.load_data(file_name, [])
     _dict = {}
     for _url, _web_source in _html_list:
         clean_content = HTMLHelper.remove_tag(_web_source)
         # note: dict.update overwrites counts for words already seen on
         # earlier pages rather than summing them
         _dict.update(WordFrequency.get_frequency_dict(clean_content))
     return _dict
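If summing counts across pages is wanted instead of the overwrite-on-update behavior noted above, a short sketch with collections.Counter:

    from collections import Counter

    def merge_frequency_dicts(frequency_dicts):
        # Counter.update adds counts instead of replacing them
        total = Counter()
        for frequency_dict in frequency_dicts:
            total.update(frequency_dict)
        return dict(total)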
Example #6
 def extract_company_name():
     crawl_dict = Main.parse_file("./resource/url_list")
     company_name_dict = {}
     total_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         print("Find %i record in %s" % (len(positions), file_name))
         for url, position in positions:
             print("work on position: %4d" % total_numbers)
             company_list = HTMLHelper.get_company_name(position)
             if len(company_list) == 0:
                 print("Cannot find company name in position %d, url is %s" %
                       (total_numbers, url))
             elif len(company_list) == 1:
                 company_name_dict[total_numbers] = SegmentHelper.normalize(
                     company_list[0])
                 print("Found company name %s for position %d" %
                       (company_list[0], total_numbers))
             else:
                 company_name_dict[total_numbers] = SegmentHelper.normalize(
                     company_list[0])
                 print(
                     "Found multi company name %s for position %d (choose the first one)"
                     % (str(company_list), total_numbers))
             total_numbers += 1
     StoreHelper.save_file(company_name_dict, "company_name.txt")
     StoreHelper.store_data(company_name_dict, "company_name.dic")
     print("In summary, total downloaded %i records!" % total_numbers)
Example #7
 def generate_company_list():
     company_name_dict = StoreHelper.load_data('company_name.dic', {})
     company_dict = {}
     for company_name in company_name_dict.values():
         DictHelper.increase_dic_key(company_dict, company_name)
     print ("Totally %d company" % len(company_dict.keys()))
     StoreHelper.save_file(DictHelper.get_sorted_list(company_dict), "company_dict.txt")
Example #8
    def run_script(src_folder,
                   dst_folder,
                   threshold,
                   probability_dict_path=None,
                   generate_dict=True):
        if probability_dict_path is None:
            probability_dict_path = path.join(dst_folder, 'probability.dict')
        if generate_dict is True:
            file_content_list = []
            for i in range(8535):
                input_file = path.join(src_folder, "%04d.dat" % i)
                if StoreHelper.is_file_exist(input_file):
                    file_content_list.append(StoreHelper.read_file(input_file))
                else:
                    print("%s not exist!" % input_file)
            probability_dict = SegmentHelper.generate_probability_dict(
                file_content_list)
            StoreHelper.store_data(probability_dict, probability_dict_path)
            print("Finished generating user dict")
        else:
            probability_dict = StoreHelper.load_data(probability_dict_path, {})
            print("Loaded dict from file, %i records in dict" %
                  len(probability_dict))

        for i in range(8535):
            input_file = path.join(src_folder, "%04d.dat" % i)
            if StoreHelper.is_file_exist(input_file):
                output_file = path.join(dst_folder, "%04d.dat" % i)
                file_content = StoreHelper.read_file(input_file)
                word_list = []
                for line in file_content.splitlines():
                    word_list.extend(
                        SegmentHelper.phase_segment(probability_dict, line,
                                                    threshold))
                StoreHelper.save_file(os.linesep.join(word_list), output_file)
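A hedged invocation sketch; the folders and threshold below are placeholders rather than values taken from the project, and run_script may actually live on a helper class:

    run_script(src_folder="./data/clean_post_lemmatize",
               dst_folder="./data/phrase_split",
               threshold=0.5)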
Example #9
    def find_position_candidate(position_index,
                                threshold,
                                feature_weight_dict=None):
        if feature_weight_dict is None:
            feature_weight_dict = {
                'years': 0.25,
                'education': 0.25,
                'major': 0.25,
                'skills': 0.25
            }
        profile_vector = StoreHelper.load_data('profile_vector_common.dat', [])
        position_vector = StoreHelper.load_data('position_vector_common.dat',
                                                [])
        index_dict = StoreHelper.load_data('index_dict.dat', {})

        if position_index is None:
            max_distance = []
            count = 0
            total_count = len(position_vector)
            # note: only the first 30 positions are scanned here
            for position in position_vector[:30]:
                print("position %d of %d" % (count, total_count))
                count += 1
                distance_list = [
                    Main.generate_match_ratio(position, profile,
                                              feature_weight_dict)
                    for profile in profile_vector
                ]
                max_distance.append(max(distance_list))
            print(max_distance)
            print("max distance %f" % max(max_distance))
            print("Totally %d profile meet requirements" % sum([
                1 if distance > threshold else 0 for distance in max_distance
            ]))
        else:
            position = position_vector[index_dict[position_index]]
            print("Position: %s" % str(position))
            distance_list = [
                Main.generate_match_ratio(position, profile,
                                          feature_weight_dict)
                for profile in profile_vector
            ]
            print(distance_list)
            print("max distance %f" % max(distance_list))
            print("Totally %d profile meet requirements" % sum([
                1 if distance > threshold else 0 for distance in distance_list
            ]))
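Main.generate_match_ratio is not listed; a hypothetical reading, assuming a weighted overlap over the per-feature 0/1 vectors built in generate_profile_position_common:

    def generate_match_ratio(position, profile, feature_weight_dict):
        # weighted fraction of the position's required items (1s) that the
        # profile also has, accumulated per feature
        ratio = 0.0
        for feature, weight in feature_weight_dict.items():
            required = position.get(feature, [])
            offered = profile.get(feature, [])
            hits = sum(1 for p, q in zip(required, offered) if p and q)
            wanted = sum(required)
            ratio += weight * (float(hits) / wanted if wanted else 0.0)
        return ratio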
Example #10
    def generate_csv_file(value_with_01, file_name='feature', select_feature=None):
        vector_list = StoreHelper.load_data('vector.dat', [])
        # Generate csv column
        csv_column = ['cluster_number', 'position_number']
        if select_feature is None:
            for item_list in vector_list:
                for item in item_list:
                    csv_column.append(item)
        else:
            vector_dict = {'working-year': vector_list[0], 'education': vector_list[1], 'major': vector_list[2],
                           'skills': vector_list[3], 'responsibility': vector_list[4]}
            vector_length = [len(item_list) for item_list in vector_list]
            vector_length_dict = {'working-year': (0, sum(vector_length[:1])),
                                  'education': (sum(vector_length[:1]), sum(vector_length[:2])),
                                  'major': (sum(vector_length[:2]), sum(vector_length[:3])),
                                  'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
                                  'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))}
            start, end = vector_length_dict[select_feature]
            csv_column.extend(vector_dict[select_feature])

        # Generate data
        data_dict = StoreHelper.load_data('./data/position_vector_01.dat', {})
        print ("data_dict row=%d, column=%d" % (len(data_dict), len(data_dict[1])))
        tag_dict = StoreHelper.load_data('position_tag.dat', {})

        # tag_dict maps cluster label -> list of position numbers, e.g. {0: [1, 4], 2: [2, 3]}
        tag_dict = {key: value for key, value in tag_dict.items() if len(value) > 50}
        print ("Tag dict keys after filter: %s" % (str(tag_dict.keys())))
        for key in tag_dict:
            data_column = []
            for number in tag_dict[key]:
                row_value = [int(key), number]
                if select_feature is not None:
                    row_value.extend(data_dict[number][start: end])
                else:
                    row_value.extend(data_dict[number])
                data_column.append(row_value)
            print("data_column row=%d, column=%d" % (len(data_column), len(data_column[1])))
            if select_feature is not None:
                show_vector_list = [vector_dict[select_feature]]
            else:
                show_vector_list = vector_list
            sort_csv_column, sort_data_column = Main.sort_column(csv_column, data_column, show_vector_list, 2, value_with_01)
            print("sort_data_column row=%d, column=%d" % (len(sort_data_column), len(sort_data_column[1])))
            Main.write_list_to_csv('%s_class_%d.csv' % (file_name, key), sort_csv_column, sort_data_column)
Example #11
 def view_downloaded_data():
     crawl_dict = Main.parse_file("./resource/url_list")
     total_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         print("Find %i record in %s" % (len(positions), file_name))
         total_numbers += len(positions)
     print("In summary, total downloaded %i records!" % total_numbers)
Example #12
 def test_average_skills_per_post():
     position_list = StoreHelper.load_data('position_list.dat', [])
     skill_number_list = [
         len(post['skills']) if 'skills' in post else 0
         for post in position_list
     ]
     print(skill_number_list)
     print("total position number %d, average %f skills per post!" %
           (len(position_list),
            sum(skill_number_list) * 1.0 / len(position_list)))
Example #13
 def cross():
     profile_list = StoreHelper.load_data('./resource/convert_profile.dat',
                                          [])
     position_dict = StoreHelper.load_data("./data/position_vector_01.dat",
                                           {})
     print(len(next(iter(position_dict.values()))))  # length of one position vector
     vector_list = StoreHelper.load_data('vector.dat', [])
     print(sum([len(value) for value in vector_list]))
     vector_dict = {
         'years': vector_list[0],
         'education': vector_list[1],
         'major': vector_list[2],
         'skills': vector_list[3],
         'responsibility': vector_list[4]
     }
     vector_length = [len(item_list) for item_list in vector_list]
     vector_length_dict = {
         'years': (0, sum(vector_length[:1])),
         'education': (sum(vector_length[:1]), sum(vector_length[:2])),
         'major': (sum(vector_length[:2]), sum(vector_length[:3])),
         'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
         'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))
     }
     position_list = []
     index_dict = {}
     count = 0
     for index, position in position_dict.items():
         index_dict[count] = index
         count += 1
         position_phrase_dict = {}
         for feature in vector_dict:
             start, end = vector_length_dict[feature]
             for i in range(len(vector_dict[feature])):
                 if position[start + i] > 0:
                     DictHelper.append_dic_key(position_phrase_dict,
                                               feature,
                                               vector_dict[feature][i])
         position_list.append(position_phrase_dict)
     StoreHelper.store_data(index_dict, 'index_dict.dat')
     StoreHelper.store_data(position_list, 'position_list.dat')
     for feature in ['years', 'education', 'major', 'skills']:
         Main.generate_feature_vector(feature, profile_list, position_list)
Example #14
 def convert_position():
     skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
     print("Got %i words from %s" % (len(skills_dict), "skills_dict"))
     discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
     print("Got %i words from %s" % (len(discipline_dict), "discipline_dict"))
     education_dict = StoreHelper.load_data("./resource/education.dat", {})
     print("Got %i words from %s" % (len(education_dict), "education_dict"))
     responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
     print("Got %i words from %s" % (len(responsibility_dict), "responsibility_dict"))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print ("working on file %s" % text_file)
             word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
             word_data = "./data/result_dict/%04d.dat" % i
             word_text = "./data/result_dict/%04d.txt" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context, word_list)
             result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict, responsibility_dict, './resource/year_convert.dat')
             StoreHelper.save_file(result_dict, word_text)
             StoreHelper.store_data(result_dict, word_data)
Example #15
 def generate_phase_list():
     probability_dict = StoreHelper.load_data('./data/probability.dic', {})
     print ("Get %i dict from file" % len(probability_dict))
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             word_file = "./data/phrase_split/%04d.dat" % i
             context = StoreHelper.read_file(text_file)
             position_helper = PositionHelper(context)
             position_dict_list = position_helper.convert_2(probability_dict)
             StoreHelper.save_file("\n".join([str(item) for item in position_dict_list]), word_file)
         else:
             print ("%s not exist!" % text_file)
Example #16
 def get_tfidf():
     blob_dict_list = Main.generate_blob_list()
     profile_dict_list = StoreHelper.load_data('./resource/merged_profile.dat', [])
     blob_dict_list.extend(profile_dict_list)
     tfidf = TFIDF(blob_dict_list)
     j = 0  # index into blob_dict_list, advanced only when the post file exists
     for i in range(8535):
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(text_file):
             print("Working on %s article!" % text_file)
             tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[j])
             StoreHelper.store_data(tf_idf_dict, "./data/tfidf-dat/%04d.dat" % i)
             StoreHelper.save_file(DictHelper.get_sorted_list(tf_idf_dict), "./data/tfidf/%04d.dat" % i)
             j += 1
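The TFIDF class is not listed; a compact sketch consistent with its usage here, where each document is a {word: count} dict and get_tf_idf scores one of them (the project's exact weighting may differ):

    import math

    class TFIDF:
        def __init__(self, blob_dict_list):
            self.blobs = list(blob_dict_list)

        def get_tf_idf(self, blob):
            total_words = float(sum(blob.values()))
            scores = {}
            for word, count in blob.items():
                # document frequency: how many documents contain the word
                df = sum(1 for b in self.blobs if word in b)
                idf = math.log(len(self.blobs) / (1.0 + df))
                scores[word] = (count / total_words) * idf
            return scores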
Example #17
    def get_post_vector():
        year_list = []
        education_list = []
        major_list = []
        skill_list = []
        responsibility_list = []
        position_tfidf_dict = {}
        for i in range(8535):
            phrase_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(phrase_dict_file):
                phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
                position_tfidf_dict[i] = phrase_dict
                if 'working-year' in phrase_dict:
                    year_list.extend(phrase_dict['working-year'].keys())
                if 'education' in phrase_dict:
                    education_list.extend(phrase_dict['education'].keys())
                if 'major' in phrase_dict:
                    major_list.extend(phrase_dict['major'].keys())
                if 'skills' in phrase_dict:
                    skill_list.extend(phrase_dict['skills'].keys())
                if 'responsibility' in phrase_dict:
                    responsibility_list.extend(phrase_dict['responsibility'].keys())
        year_list = list(set(year_list))
        print ("year list count: %d" % len(year_list))
        education_list = list(set(education_list))
        print("education_list list count: %d" % len(education_list))
        major_list = list(set(major_list))
        print("major_list list count: %d" % len(major_list))
        skill_list = list(set(skill_list))
        print("skill_list list count: %d" % len(skill_list))
        responsibility_list = list(set(responsibility_list))
        print("responsibility_list list count: %d" % len(responsibility_list))
        StoreHelper.store_data([year_list, education_list, major_list, skill_list, responsibility_list], 'vector.dat')

        position_vectors = {}
        for i in range(8535):
            if i in position_tfidf_dict:
                position = []
                # concatenate the five feature sub-vectors; a missing word
                # (or a missing feature section) contributes 0
                for word_list, key in ((year_list, 'working-year'),
                                       (education_list, 'education'),
                                       (major_list, 'major'),
                                       (skill_list, 'skills'),
                                       (responsibility_list, 'responsibility')):
                    feature_dict = position_tfidf_dict[i].get(key, {})
                    position.extend(feature_dict.get(word, 0) for word in word_list)
                position_vectors[i] = position
        StoreHelper.store_data(position_vectors, './data/position_vector_01.dat')
Example #18
    def run_cluster():
        final_vector = [[0 for j in range(310)] for i in range(4980)]
        key_list = list(StoreHelper.load_data("./resource/feature.dat", {}).keys())
        print("key list length: %i" % len(key_list))

        blob_dict_list = []
        skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
        discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
        education_dict = StoreHelper.load_data("./resource/education.dat", {})
        for i in range(4980):
            text_file = "./data/datascientist/%04d.txt" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            blob_dict_list.append(position_helper.convert(skills_dict, discipline_dict, education_dict)[4])

        tfidf = TFIDF(blob_dict_list)
        for i in range(4980):
            print("Working on %i article!" % i)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[i])
            # tf_idf_dict = {key: "%.6f" % value for key, value in tf_idf_dict.items()}
            for j in range(310):
                if key_list[j] in tf_idf_dict:
                    final_vector[i][j] = tf_idf_dict[key_list[j]]
        StoreHelper.store_data(final_vector, "./data/vectors.dat")
Example #19
 def generate_all_text():
     crawl_dict = StoreHelper.parse_file("./resource/url_list")
     count_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         for url, web_source in positions:
             if 'data scientist' in web_source.lower():
                 text_content = HTMLHelper.get_text(web_source)
                 # text_dict = WordFrequency.get_frequency_dict(text_content)
                 # output = [str(item) for item in text_dict]
                 # output.extend([" ", text_content, " ",  url])
                 StoreHelper.save_file(text_content, "./data/datascientist/%04d.txt" % count_numbers)
                 count_numbers += 1
             else:
                 print ("Data Scientist not found in %s!" % url)
Example #20
 def convert_skill_100():
     skills_list = StoreHelper.load_data("position_profile_skills.dat", [])
     skills_convert_dict = {}
     preferred_list = [
         "analysis", "python", "r", "analytics", "machine learning", "sql",
         "modeling", "big data", "hadoop", "java", "statistics",
         "mathematics", "sas", "data mining", "processing", "spark",
         "security", "visualization", "testing", "c", "access",
         "optimization", "hive", "integration", "excel", "tableau",
         "scripting", "development", "scala", "matlab", "linux", "nosql",
         "management", "intelligence", "aws", "regression", "spss", "pig",
         "clustering", "saas", "oracle", "go", "physics", "classification",
         "javascript", "operations research", "mapreduce", "forecasting",
         "engineering", "powerpoint", "automation", "b2b", "segmentation",
         "dashboard", "computing", "deep learning", "defense", "unix",
         "hbase", "d3", "perl", "algorithms", "advertising", "word",
         "communication", "simulation", "data collection", "hardware",
         "command", "apache", "troubleshooting", "ruby", "mongodb", "mysql",
         "probability", "hdfs", "econometrics", "data warehousing", "scrum",
         "cassandra", "databases", "git", "cluster", "statistical software",
         "manufacturing", "improvement", "pricing", "data architecture",
         "critical thinking", "html", "design", "strategy", "fraud",
         "microsoft office", "teradata", "quality assurance",
         "data integration", "experimentation", "customer service",
         "bioinformatics"
     ]
     for key in preferred_list:
         match = False
         if key not in skills_list:
             for skill in skills_list:
                 if key in skill:
                     match = True
                     if skill not in skills_convert_dict:
                         skills_convert_dict[skill] = key
                     else:
                         print("%s key duplicate" % skill)
                     break
         else:
             match = True
             skills_convert_dict[key] = key
         if not match:
             print(key)
     StoreHelper.store_data(skills_convert_dict, 'skills_convert_dict.dat')
     print(len(skills_convert_dict))
Example #21
    def compute_tfidf():
        blob_dict = {}
        total_dict = {}
        probability_dict = StoreHelper.load_data('./data/probability.dic', {})
        print("Get %i dict from file" % len(probability_dict))
        for i in range(8535):
            text_file = "./data/clean_post_lemmatize/%04d.dat" % i
            if StoreHelper.is_file_exist(text_file):
                context = StoreHelper.read_file(text_file)
                position_helper = PositionHelper(context)
                blob_dict[i] = position_helper.convert_2(probability_dict)

        tfidf = TFIDF(blob_dict.values())
        for i in range(8535):
            if i in blob_dict:
                output_file = "./data/tfidf-dat/%04d.dat" % i
                print ("Working on %i article!" % i)
                tf_idf_dict = tfidf.get_tf_idf(blob_dict[i])
                DictHelper.merge_dict(total_dict, tf_idf_dict)
                tf_idf_dict = {key: float("%.6f" % value) for key, value in tf_idf_dict.items()}
                StoreHelper.store_data(tf_idf_dict, output_file)
        StoreHelper.store_data(total_dict, "./data/tfidf.dat")
Example #22
 def extract_download_data():
     crawl_dict = Main.parse_file("./resource/url_list")
     total_numbers = 0
     for location in crawl_dict.keys():
         file_name = "./data/post/%s.dat" % location
         positions = StoreHelper.load_data(file_name, [])
         print("Find %i record in %s" % (len(positions), file_name))
         for url, position in positions:
             # step 1, store origin file
             # output1 = "./data/text/%04d.html" % total_numbers
             # StoreHelper.save_file(position, output1)
             output2 = "./data/clean_post_without_header/%04d.dat" % total_numbers
             print("work on position: %4d" % total_numbers)
             status, content = HTMLHelper.get_post(position)
             if status is False:
                 print("Error happened while extracting %s" % url)
                 # StoreHelper.save_file(position, output2)
             else:
                 StoreHelper.save_file(HTMLHelper.post_clean(content),
                                       output2)
             total_numbers += 1
     print("In summary, total downloaded %i records!" % total_numbers)
Example #23
 def generate_blob_list():
     blob_list = []
     for i in range(8535):
         phrase_dict_file = "./data/result_dict/%04d.dat" % i
         text_file = "./data/clean_post_lemmatize/%04d.dat" % i
         if StoreHelper.is_file_exist(phrase_dict_file):
             phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
             text_content = StoreHelper.read_file(text_file)
             word_list = []
             for line in text_content.splitlines():
                 if line.endswith('.'):
                     line = line[:-1]
                 for word in line.split(' '):
                     word_list.append(word)
             for _type in phrase_dict.keys():
                 for words in phrase_dict[_type]:
                     # remove the phrase's constituent words (first occurrence
                     # of each) and append the phrase itself as a single token
                     for word in words.split(' '):
                         if word in word_list:
                             word_list.remove(word)
                     word_list.append(words)
             blob_list.append(DictHelper.dict_from_count_list(word_list))
     StoreHelper.store_data(blob_list, './data/blob_list.dat')
     return blob_list
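DictHelper.dict_from_count_list, inferred from its usage above, presumably turns the token list into a {word: count} dict; a one-line sketch:

    from collections import Counter

    def dict_from_count_list(word_list):
        # {word: occurrence count} built from a flat list of tokens
        return dict(Counter(word_list))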
Example #24
 def cluster_features():
     feature_vector_dict = StoreHelper.load_data('feature_vector.dat', {})
     for feature in feature_vector_dict:
         print ("Running cluster for %s" % feature)
         Main.cluster_with_birch(feature_vector_dict[feature])
         Main.generate_csv_file(value_with_01=True, file_name=feature, select_feature=feature)
Example #25
    def generate_profile_position_common():
        print("step 1, generate common feature")
        common_feature_dict = {}
        for feature in ['years', 'education', 'major', 'skills']:
            common_feature_dict[feature] = StoreHelper.load_data(
                "position_profile_%s.dat" % feature, [])
            print("%s: %s" % (feature, common_feature_dict[feature]))
            print("Load %d phrase for %s" %
                  (len(common_feature_dict[feature]), feature))

        print("step 2, generate vector for post and profile")
        profile_list = StoreHelper.load_data('./resource/convert_profile.dat',
                                             [])
        print("sample: %s" % profile_list[0])
        total_profile = len(profile_list)
        print("Load %d profile from file" % total_profile)
        position_list = StoreHelper.load_data('position_list.dat', [])
        print("sample: %s" % position_list[0])
        total_position = len(position_list)
        print("Load %d position from file" % total_position)

        skills_convert_dict = StoreHelper.load_data('skills_convert_dict.dat',
                                                    {})
        print("Load %d skill convert dict from file" %
              len(skills_convert_dict))

        profile_vector = []
        position_vector = []
        count = 0
        for profile in profile_list:
            print("Work on profile %d totally %d" % (count, total_profile))
            count += 1
            if 'skills' in profile:
                print("skills before convert number: %d" %
                      len(profile['skills']))
                new_skill_set = []
                for skill in profile['skills']:
                    if skill in skills_convert_dict:
                        new_skill_set.append(skill)
                profile['skills'] = list(set(new_skill_set))
                print("skills after convert number: %d" %
                      len(profile['skills']))
            # initialize outside the if-block so profiles without skills do
            # not silently reuse the previous profile's dict
            profile_dict = {
                feature: []
                for feature in common_feature_dict.keys()
            }
            for feature in common_feature_dict:
                if feature in profile:
                    for phrase in common_feature_dict[feature]:
                        profile_dict[feature].append(1 if phrase in
                                                     profile[feature] else 0)
                else:
                    profile_dict[feature] = [
                        0 for i in range(len(common_feature_dict[feature]))
                    ]
            profile_vector.append(profile_dict)

        count = 0
        for position in position_list:
            print("Work on position %d totally %d" % (count, total_position))
            count += 1
            if 'skills' in position:
                print("skills before convert number: %d" %
                      len(position['skills']))
                new_skill_set = []
                for skill in position['skills']:
                    if skill in skills_convert_dict:
                        new_skill_set.append(skill)
                position['skills'] = list(set(new_skill_set))
                print("skills after convert number: %d" %
                      len(position['skills']))
            position_dict = {
                feature: []
                for feature in common_feature_dict.keys()
            }
            for feature in common_feature_dict:
                if feature in position:
                    for phrase in common_feature_dict[feature]:
                        position_dict[feature].append(1 if phrase in
                                                      position[feature] else 0)
                else:
                    position_dict[feature] = [
                        0 for i in range(len(common_feature_dict[feature]))
                    ]
            position_vector.append(position_dict)

        print("step 3, store into data file")
        print("Profile sample: %s" % str(profile_vector[0]))
        print("Position sample: %s" % str(position_vector[0]))
        StoreHelper.store_data(profile_vector, 'profile_vector_common.dat')
        StoreHelper.store_data(position_vector, 'position_vector_common.dat')
Example #26
 def cluster_with_birch(position_dict=None):
     if position_dict is None:
         position_dict = StoreHelper.load_data("./data/position_vector_01.dat", {})
     _vector_list = list(position_dict.values())
     _index_list = list(position_dict.keys())
     ClusterHelper.birch_cluster(_vector_list, _index_list)
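ClusterHelper.birch_cluster is not shown; a minimal sketch of what it might wrap using scikit-learn's Birch, with a hypothetical cluster count:

    from sklearn.cluster import Birch

    def birch_cluster(vector_list, index_list, n_clusters=3):
        # map each original position index to its cluster label
        labels = Birch(n_clusters=n_clusters).fit_predict(list(vector_list))
        return dict(zip(index_list, labels))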
Example #27
 def generate_feature_list():
     vector_data = StoreHelper.load_data('vector.dat', [])
     vector_dict = {'year': vector_data[0], 'education': vector_data[1], 'major': vector_data[2],
                    'skill': vector_data[3], 'responsibility': vector_data[4]}
     StoreHelper.save_file(vector_dict, 'vector.txt')
Example #28
 def get_all_job_post(url_file, post_file):
     post_info_list = []
     for url in StoreHelper.load_data(url_file, {}):
         web_content = CrawlHelper.get_web_source(url)
         post_info_list.append((url, web_content))
     StoreHelper.store_data(post_info_list, post_file)
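CrawlHelper.get_web_source presumably fetches the page source; a minimal sketch with requests (the real helper may add headers, retries, or throttling):

    import requests

    def get_web_source(url, timeout=10):
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text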