Example #1
def compute_center_point(exclude_post=(1404, 3721, 4337, 2085, 7246), select_feature=None):
    # Tuple default avoids the shared mutable-default-argument pitfall.
    position_vectors = StoreHelper.load_data('./data/position_vector_01.dat', {})
    for index in exclude_post:
        if index in position_vectors:
            del position_vectors[index]
    vector_list = StoreHelper.load_data('vector.dat', [])

    vector_dict = {'working-year': vector_list[0], 'education': vector_list[1], 'major': vector_list[2],
                   'skills': vector_list[3], 'responsibility': vector_list[4]}
    vector_length = [len(item_list) for item_list in vector_list]
    vector_length_dict = {'working-year': (0, sum(vector_length[:1])),
                          'education': (sum(vector_length[:1]), sum(vector_length[:2])),
                          'major': (sum(vector_length[:2]), sum(vector_length[:3])),
                          'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
                          'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))}

    # Materialize the view: dict.keys() cannot be indexed in Python 3.
    csv_index = list(position_vectors.keys())

    if select_feature is None:
        csv_column = []
        for item_list in vector_list:
            csv_column.extend(item_list)
        csv_data = list(position_vectors.values())
        csv_file = 'center_point.csv'
    else:
        start, end = vector_length_dict[select_feature]
        csv_column = vector_dict[select_feature]
        csv_data = [position[start:end] for position in position_vectors.values()]
        csv_file = '%s_center_point.csv' % select_feature
    center_point = [0 for _ in range(len(csv_column))]
    for position in csv_data:
        for i in range(len(center_point)):
            center_point[i] += position[i]
    center_point = [value / len(position_vectors) for value in center_point]
    print("Center point: %s" % str(center_point))
    StoreHelper.store_data(center_point, 'center_point.dat')
    center_dict = {csv_column[i]: center_point[i] for i in range(len(csv_column))}
    print(center_dict)
    center_list = DictHelper.get_sorted_list(center_dict, sorted_by_key=False)
    print(center_list)
    Main.write_list_to_csv(csv_file, [pair[0] for pair in center_list], [[pair[1] for pair in center_list]])

    max_distance = (0, 0)
    for i in range(len(csv_data)):
        distance = Main.compute_distance(center_point, csv_data[i])
        if distance > max_distance[1]:
            max_distance = (csv_index[i], distance)
    print("max distance: %s" % str(max_distance))
Example #2
def convert_skill_100():
    skills_list = StoreHelper.load_data("position_profile_skills.dat", [])
    skills_convert_dict = {}
    prefered_list = [
        "analysis", "python", "r", "analytics", "machine learning", "sql",
        "modeling", "big data", "hadoop", "java", "statistics",
        "mathematics", "sas", "data mining", "processing", "spark",
        "security", "visualization", "testing", "c", "access",
        "optimization", "hive", "integration", "excel", "tableau",
        "scripting", "development", "scala", "matlab", "linux", "nosql",
        "management", "intelligence", "aws", "regression", "spss", "pig",
        "clustering", "saas", "oracle", "go", "physics", "classification",
        "javascript", "operations research", "mapreduce", "forecasting",
        "engineering", "powerpoint", "automation", "b2b", "segmentation",
        "dashboard", "computing", "deep learning", "defense", "unix",
        "hbase", "d3", "perl", "algorithms", "advertising", "word",
        "communication", "simulation", "data collection", "hardware",
        "command", "apache", "troubleshooting", "ruby", "mongodb", "mysql",
        "probability", "hdfs", "econometrics", "data warehousing", "scrum",
        "cassandra", "databases", "git", "cluster", "statistical software",
        "manufacturing", "improvement", "pricing", "data architecture",
        "critical thinking", "html", "design", "strategy", "fraud",
        "microsoft office", "teradata", "quality assurance",
        "data integration", "experimentation", "customer service",
        "bioinformatics"
    ]
    for key in prefered_list:
        match = False
        if key not in skills_list:
            for skill in skills_list:
                if key in skill:
                    match = True
                    if skill not in skills_convert_dict:
                        skills_convert_dict[skill] = key
                    else:
                        print("%s key duplicate" % skill)
                    break
        else:
            match = True
            skills_convert_dict[key] = key
        if not match:
            print(key)
    StoreHelper.store_data(skills_convert_dict, 'skills_convert_dict.dat')
    print(len(skills_convert_dict))
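A self-contained run of the same matching rule on toy data (names here are illustrative only, not from the project):

    skills_list = ['python 3', 'r', 'apache spark', 'pytorch']
    prefered_list = ['python', 'r', 'spark']

    skills_convert_dict = {}
    for key in prefered_list:
        if key in skills_list:                  # exact match keeps the key as-is
            skills_convert_dict[key] = key
        else:                                   # substring match maps the raw skill to the key
            for skill in skills_list:
                if key in skill:
                    skills_convert_dict.setdefault(skill, key)
                    break

    print(skills_convert_dict)
    # {'python 3': 'python', 'r': 'r', 'apache spark': 'spark'}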
Example #3
def cross():
    profile_list = StoreHelper.load_data('./resource/convert_profile.dat', [])
    position_dict = StoreHelper.load_data("./data/position_vector_01.dat", {})
    # dict views are not subscriptable in Python 3; materialize first.
    print(len(list(position_dict.values())[0]))
    vector_list = StoreHelper.load_data('vector.dat', [])
    print(sum([len(value) for value in vector_list]))
    vector_dict = {
        'years': vector_list[0],
        'education': vector_list[1],
        'major': vector_list[2],
        'skills': vector_list[3],
        'responsibility': vector_list[4]
    }
    vector_length = [len(item_list) for item_list in vector_list]
    vector_length_dict = {
        'years': (0, sum(vector_length[:1])),
        'education': (sum(vector_length[:1]), sum(vector_length[:2])),
        'major': (sum(vector_length[:2]), sum(vector_length[:3])),
        'skills': (sum(vector_length[:3]), sum(vector_length[:4])),
        'responsibility': (sum(vector_length[:4]), sum(vector_length[:5]))
    }
    position_list = []
    index_dict = {}
    count = 0
    for index, position in position_dict.items():
        index_dict[count] = index
        count += 1
        position_phrase_dict = {}
        for feature in vector_dict:
            start, end = vector_length_dict[feature]
            for i in range(len(vector_dict[feature])):
                if position[start + i] > 0:
                    DictHelper.append_dic_key(position_phrase_dict, feature,
                                              vector_dict[feature][i])
        position_list.append(position_phrase_dict)
    StoreHelper.store_data(index_dict, 'index_dict.dat')
    StoreHelper.store_data(position_list, 'position_list.dat')
    for feature in ['years', 'education', 'major', 'skills']:
        Main.generate_feature_vector(feature, profile_list, position_list)
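The (start, end) bookkeeping simply slices one concatenated position vector back into per-feature segments. A toy illustration with hypothetical sizes:

    # Two features of length 2 and 3 concatenated into one 5-dimensional vector.
    vector_dict = {'education': ['bs', 'ms'], 'skills': ['python', 'sql', 'spark']}
    vector_length_dict = {'education': (0, 2), 'skills': (2, 5)}
    position = [0, 1, 1, 0, 1]

    for feature, (start, end) in vector_length_dict.items():
        phrases = [vector_dict[feature][i]
                   for i, flag in enumerate(position[start:end]) if flag > 0]
        print(feature, phrases)
    # education ['ms']
    # skills ['python', 'spark']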
Example #4
def convert_position():
    skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
    print("Get %i words from %s" % (len(skills_dict), "skills_dict"))
    discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
    print("Get %i words from %s" % (len(discipline_dict), "discipline_dict"))
    education_dict = StoreHelper.load_data("./resource/education.dat", {})
    print("Get %i words from %s" % (len(education_dict), "education_dict"))
    responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
    print("Get %i words from %s" % (len(responsibility_dict), "responsibility_dict"))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("working on file %s" % text_file)
            word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
            word_data = "./data/result_dict/%04d.dat" % i
            word_text = "./data/result_dict/%04d.txt" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context, word_list)
            result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict,
                                                  responsibility_dict, './resource/year_convert.dat')
            StoreHelper.save_file(result_dict, word_text)
            StoreHelper.store_data(result_dict, word_data)
Example #5
def compute_tfidf():
    blob_dict = {}
    total_dict = {}
    probability_dict = StoreHelper.load_data('./data/probability.dic', {})
    print("Get %i dict from file" % len(probability_dict))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            blob_dict[i] = position_helper.convert_2(probability_dict)

    tfidf = TFIDF(blob_dict.values())
    for i in range(8535):
        if i in blob_dict:
            output_file = "./data/tfidf-dat/%04d.dat" % i
            print("Working on %i article!" % i)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict[i])
            DictHelper.merge_dict(total_dict, tf_idf_dict)
            tf_idf_dict = {key: float("%.6f" % value) for key, value in tf_idf_dict.items()}
            StoreHelper.store_data(tf_idf_dict, output_file)
    StoreHelper.store_data(total_dict, "./data/tfidf.dat")
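TFIDF here is a project class, not a library import. A minimal sketch of plausible behavior under the standard definition tf-idf(t, d) = tf(t, d) * log(N / df(t)), assuming each blob is a {word: count} dict as produced above:

    import math

    class TFIDF(object):
        # Sketch only: documents are {word: count} dicts, as in blob_dict.values().
        def __init__(self, documents):
            self.documents = list(documents)
            self.n_docs = len(self.documents)

        def get_tf_idf(self, document):
            total = sum(document.values())
            tf_idf = {}
            for word, count in document.items():
                df = sum(1 for doc in self.documents if word in doc)
                tf_idf[word] = (count / total) * math.log(self.n_docs / df)
            return tf_idf

    docs = [{'data': 2, 'science': 1}, {'data': 1, 'python': 3}]
    print(TFIDF(docs).get_tf_idf(docs[0]))
    # {'data': 0.0, 'science': 0.23104906018664842}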
Example #6
def generate_blob_list():
    blob_list = []
    for i in range(8535):
        phrase_dict_file = "./data/result_dict/%04d.dat" % i
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(phrase_dict_file):
            phrase_dict = StoreHelper.load_data(phrase_dict_file, {})
            text_content = StoreHelper.read_file(text_file)
            word_list = []
            for line in text_content.splitlines():
                if line.endswith('.'):
                    line = line[:-1]
                for word in line.split(' '):
                    word_list.append(word)
            # Replace the individual words of each known phrase with the phrase itself.
            for _type in phrase_dict.keys():
                for words in phrase_dict[_type]:
                    for word in words.split(' '):
                        if word in word_list:
                            word_list.remove(word)
                    word_list.append(words)
            blob_list.append(DictHelper.dict_from_count_list(word_list))
    StoreHelper.store_data(blob_list, './data/blob_list.dat')
    return blob_list
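DictHelper.dict_from_count_list is presumably a word-count constructor; if that assumption holds, the standard-library equivalent is:

    from collections import Counter

    word_list = ['python', 'sql', 'python', 'machine learning']
    print(dict(Counter(word_list)))
    # {'python': 2, 'sql': 1, 'machine learning': 1}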
Example #7
def run_cluster():
    final_vector = [[0 for j in range(310)] for i in range(4980)]
    # Materialize the keys so they can be indexed by position below.
    key_set = list(StoreHelper.load_data("./resource/feature.dat", {}).keys())
    print("key set length: %i" % len(key_set))

    blob_dict_list = []
    skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
    discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
    education_dict = StoreHelper.load_data("./resource/education.dat", {})
    for i in range(4980):
        text_file = "./data/datascientist/%04d.txt" % i
        context = StoreHelper.read_file(text_file)
        position_helper = PositionHelper(context)
        blob_dict_list.append(position_helper.convert(skills_dict, discipline_dict, education_dict)[4])

    tfidf = TFIDF(blob_dict_list)
    for i in range(4980):
        print("Working on %i article!" % i)
        tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[i])
        # tf_idf_dict = {key: "%.6f" % value for key, value in tf_idf_dict.items()}
        for j in range(310):
            if key_set[j] in tf_idf_dict:
                final_vector[i][j] = tf_idf_dict[key_set[j]]
    StoreHelper.store_data(final_vector, "./data/vectors.dat")
Example #8
import ntpath
import string

from sklearn.feature_extraction.text import TfidfVectorizer


class NltkHelper(object):
    @staticmethod
    def compute_tfidf(token_file_dict):
        tfidf = TfidfVectorizer(tokenizer=NltkHelper.tokenize,
                                stop_words='english')
        # Fit on the document texts; iterating the dict itself would
        # vectorize the file names instead.
        tfs = tfidf.fit_transform(token_file_dict.values())
        return tfidf, tfs

    @staticmethod
    def generate_token_dict(text_file_list):
        token_file_dict = {}
        for text_file in text_file_list:
            file_name = ntpath.basename(text_file)
            if StoreHelper.is_file_exist(text_file):
                file_content = StoreHelper.read_file(text_file)
                lowers = file_content.lower()
                # str.translate takes a mapping table in Python 3.
                no_punctuation = lowers.translate(
                    str.maketrans('', '', string.punctuation))
                token_file_dict[file_name] = no_punctuation
        return token_file_dict


if __name__ == '__main__':
    text_file_list = [
        "../data/clean_post_without_header/%04d.dat" % i for i in range(35)
    ]
    _token_file_dict = NltkHelper.generate_token_dict(text_file_list)
    _tfidf, _tfs = NltkHelper.compute_tfidf(_token_file_dict)
    StoreHelper.store_data(_tfs, "../data/nltk/tfs.dat")
    # get_feature_names was removed in scikit-learn 1.2.
    feature_names = _tfidf.get_feature_names_out()
    for col in _tfs.nonzero()[1]:
        print(feature_names[col], '-', _tfs[0, col])
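NltkHelper.tokenize is referenced but not shown. A plausible stand-in using NLTK's word tokenizer and Porter stemmer (an assumption, not the project's actual tokenizer):

    import nltk
    from nltk.stem.porter import PorterStemmer

    # nltk.download('punkt') is required once for word_tokenize.
    stemmer = PorterStemmer()

    def tokenize(text):
        # Split into tokens, then stem so inflected forms collapse together.
        return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

    print(tokenize("testing tokenized documents"))  # e.g. ['test', 'token', 'document']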
Example #9
def generate_profile_position_common():
    print("step 1, generate common feature")
    common_feature_dict = {}
    for feature in ['years', 'education', 'major', 'skills']:
        common_feature_dict[feature] = StoreHelper.load_data(
            "position_profile_%s.dat" % feature, [])
        print("%s: %s" % (feature, common_feature_dict[feature]))
        print("Load %d phrase for %s" %
              (len(common_feature_dict[feature]), feature))

    print("step 2, generate vector for post and profile")
    profile_list = StoreHelper.load_data('./resource/convert_profile.dat', [])
    print("sample: %s" % profile_list[0])
    total_profile = len(profile_list)
    print("Load %d profile from file" % total_profile)
    position_list = StoreHelper.load_data('position_list.dat', [])
    print("sample: %s" % position_list[0])
    total_position = len(position_list)
    print("Load %d position from file" % total_position)

    skills_convert_dict = StoreHelper.load_data('skills_convert_dict.dat', {})
    print("Load %d skill convert dict from file" % len(skills_convert_dict))

    profile_vector = []
    position_vector = []
    count = 0
    for profile in profile_list:
        print("Work on profile %d totally %d" % (count, total_profile))
        count += 1
        if 'skills' in profile:
            print("skills before convert number: %d" % len(profile['skills']))
            new_skill_set = []
            for skill in profile['skills']:
                if skill in skills_convert_dict:
                    new_skill_set.append(skill)
            profile['skills'] = list(set(new_skill_set))
            print("skills after convert number: %d" % len(profile['skills']))
        # Build the dict for every profile, not only those with a 'skills'
        # key; otherwise skill-less profiles reuse the previous iteration's
        # dict (and the first such profile raises NameError).
        profile_dict = {feature: [] for feature in common_feature_dict.keys()}
        for feature in common_feature_dict:
            if feature in profile:
                for phrase in common_feature_dict[feature]:
                    profile_dict[feature].append(
                        1 if phrase in profile[feature] else 0)
            else:
                profile_dict[feature] = [
                    0 for i in range(len(common_feature_dict[feature]))
                ]
        profile_vector.append(profile_dict)

    count = 0
    for position in position_list:
        print("Work on position %d totally %d" % (count, total_position))
        count += 1
        if 'skills' in position:
            print("skills before convert number: %d" % len(position['skills']))
            new_skill_set = []
            for skill in position['skills']:
                if skill in skills_convert_dict:
                    new_skill_set.append(skill)
            position['skills'] = list(set(new_skill_set))
            print("skills after convert number: %d" % len(position['skills']))
        position_dict = {feature: [] for feature in common_feature_dict.keys()}
        for feature in common_feature_dict:
            if feature in position:
                for phrase in common_feature_dict[feature]:
                    position_dict[feature].append(
                        1 if phrase in position[feature] else 0)
            else:
                position_dict[feature] = [
                    0 for i in range(len(common_feature_dict[feature]))
                ]
        position_vector.append(position_dict)

    print("step 3, store into data file")
    print("Profile sample: %s" % str(profile_vector[0]))
    print("Position sample: %s" % str(position_vector[0]))
    StoreHelper.store_data(profile_vector, 'profile_vector_common.dat')
    StoreHelper.store_data(position_vector, 'position_vector_common.dat')
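Both loops above are plain one-hot encoding against a shared phrase list. A self-contained toy run (illustrative names only):

    common_skills = ['python', 'sql', 'spark']       # a common feature list
    profile = {'skills': ['sql', 'python']}

    encoded = [1 if phrase in profile['skills'] else 0 for phrase in common_skills]
    print(encoded)  # [1, 1, 0]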
Example #10
def get_all_job_post(url_file, post_file):
    post_info_list = []
    for url in StoreHelper.load_data(url_file, {}):
        web_content = CrawlHelper.get_web_source(url)
        post_info_list.append((url, web_content))
    StoreHelper.store_data(post_info_list, post_file)
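All ten examples lean on StoreHelper, which is not shown. A minimal pickle-based sketch that matches the call sites above (an assumption about its behavior, not the project's code):

    import os
    import pickle

    class StoreHelper(object):
        @staticmethod
        def is_file_exist(path):
            return os.path.isfile(path)

        @staticmethod
        def load_data(path, default):
            # Return the unpickled object, or the caller's default when the file is absent.
            if not os.path.isfile(path):
                return default
            with open(path, 'rb') as fh:
                return pickle.load(fh)

        @staticmethod
        def store_data(data, path):
            with open(path, 'wb') as fh:
                pickle.dump(data, fh)

        @staticmethod
        def read_file(path):
            with open(path, 'r') as fh:
                return fh.read()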