Example #1
def generate_feature_vectors():
    # step 1, merge the per-posting result dicts into one list of dicts per feature
    feature_total_dict = {}
    for i in range(8535):
        result_dict_file = "./data/words_only/data/%04d.dat" % i
        if StoreHelper.is_file_exist(result_dict_file):
            result_dict = StoreHelper.load_data(result_dict_file, {})
            for feature in result_dict:
                DictHelper.append_dic_key(feature_total_dict, feature, result_dict[feature])

    # step 2, build the vector header (the de-duplicated word list) for each feature
    feature_vector_header_dict = {}
    for feature in feature_total_dict:
        feature_list = []
        for words_dict in feature_total_dict[feature]:
            feature_list.extend(words_dict.keys())
        feature_vector_header_dict[feature] = list(set(feature_list))
    StoreHelper.store_data(feature_vector_header_dict, 'feature_vector_header.dat')

    # step 3, collect the weight for each slot of each feature vector
    feature_vector_dict = {}
    for feature in feature_vector_header_dict:
        feature_dict = {}
        feature_list = feature_vector_header_dict[feature]
        for i in range(8535):
            result_dict_file = "./data/words_only/data/%04d.dat" % i
            if StoreHelper.is_file_exist(result_dict_file):
                result_dict = StoreHelper.load_data(result_dict_file, {})
                # .get() guards against postings that lack this feature entirely,
                # which would otherwise raise a KeyError here
                feature_dict[i] = [result_dict.get(feature, {}).get(words, 0) for words in feature_list]
        feature_vector_dict[feature] = feature_dict
    # print (feature_vector_dict.keys())
    # print (str([len(value[1]) for value in feature_vector_dict.values()]))
    StoreHelper.store_data(feature_vector_dict, 'feature_vector.dat')
    StoreHelper.save_file(feature_vector_dict, 'feature_vector.txt')
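For orientation, a brief sketch of how the two artifacts written above fit together: each posting's vector in feature_vector.dat is index-aligned with the word list stored for the same feature in feature_vector_header.dat. Assuming StoreHelper.load_data behaves as in the examples on this page (a path plus a default value), that invariant can be checked like this:

header_dict = StoreHelper.load_data('feature_vector_header.dat', {})
vector_dict = StoreHelper.load_data('feature_vector.dat', {})
for feature, header in header_dict.items():
    posting_vectors = vector_dict.get(feature, {})
    # every stored vector must be exactly as long as its feature header
    assert all(len(vec) == len(header) for vec in posting_vectors.values())
    print("%s: %d words, %d postings" % (feature, len(header), len(posting_vectors)))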
Example #2
def get_only_words_in_5():
    for i in range(8535):
        result_dict = {}
        words_dict_file = "./data/result_dict/%04d.dat" % i
        tfidf_dict_file = "./data/tfidf-dat/%04d.dat" % i
        if StoreHelper.is_file_exist(tfidf_dict_file):
            tfidf_dict = StoreHelper.load_data(tfidf_dict_file, {})
            words_dict = StoreHelper.load_data(words_dict_file, {})
            for _type in words_dict.keys():
                result_dict[_type] = {}
                for word in words_dict[_type]:
                    if word in tfidf_dict:
                        result_dict[_type][word] = tfidf_dict[word]
                    else:
                        # fall back to the normalized form before giving up
                        normal_word = SegmentHelper.normalize(word)
                        if normal_word in tfidf_dict:
                            print("Saved by normalization for %s" % normal_word)
                            result_dict[_type][word] = tfidf_dict[normal_word]
                        else:
                            print("%s not found in %s" % (word, tfidf_dict_file))
            # for _type in result_dict.keys():
            #     result_dict[_type] = DictHelper.get_sorted_list(result_dict[_type])
            # print (result_dict.keys())
            StoreHelper.store_data(result_dict, "./data/words_only/data/%04d.dat" % i)
            StoreHelper.save_file(result_dict, "./data/words_only/text/%04d.txt" % i)
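get_only_words_in_5() produces exactly the per-posting files that generate_feature_vectors() (Example #1) later merges: a mapping from feature type to {word: tf-idf weight}. A hypothetical record, for illustration only (the words and weights here are made up, not taken from the source project):

record = StoreHelper.load_data("./data/words_only/data/0000.dat", {})
# e.g. {'skills': {'python': 0.12, 'sql': 0.08}, 'education': {'phd': 0.05}}
for _type, weighted_words in record.items():
    top = sorted(weighted_words.items(), key=lambda kv: kv[1], reverse=True)[:5]
    print(_type, top)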
Example #3
def generate_company_list():
    company_name_dict = StoreHelper.load_data('company_name.dic', {})
    company_dict = {}
    # count how often each company name occurs across all postings
    for company_name in company_name_dict.values():
        DictHelper.increase_dic_key(company_dict, company_name)
    print("%d companies in total" % len(company_dict))
    StoreHelper.save_file(DictHelper.get_sorted_list(company_dict), "company_dict.txt")
Example #4
def extract_company_name():
    crawl_dict = Main.parse_file("./resource/url_list")
    company_name_dict = {}
    total_numbers = 0
    for location in crawl_dict.keys():
        file_name = "./data/post/%s.dat" % location
        positions = StoreHelper.load_data(file_name, [])
        print("Found %i records in %s" % (len(positions), file_name))
        for url, position in positions:
            print("work on position: %4d" % total_numbers)
            company_list = HTMLHelper.get_company_name(position)
            if len(company_list) == 0:
                print("Cannot find company name in position %d, url is %s" %
                      (total_numbers, url))
            elif len(company_list) == 1:
                company_name_dict[total_numbers] = SegmentHelper.normalize(company_list[0])
                print("Found company name %s for position %d" %
                      (company_list[0], total_numbers))
            else:
                # several candidates: keep the first one
                company_name_dict[total_numbers] = SegmentHelper.normalize(company_list[0])
                print("Found multiple company names %s for position %d (choosing the first one)" %
                      (str(company_list), total_numbers))
            total_numbers += 1
    StoreHelper.save_file(company_name_dict, "company_name.txt")
    StoreHelper.store_data(company_name_dict, "company_name.dic")
    print("In summary, processed %i records in total!" % total_numbers)
Example #5
import os
from os import path

def run_script(src_folder,
               dst_folder,
               threshold,
               probability_dict_path=None,
               generate_dict=True):
    if probability_dict_path is None:
        probability_dict_path = path.join(dst_folder, 'probability.dict')
    if generate_dict:
        # build the probability dict from every input file that exists
        file_content_list = []
        for i in range(8535):
            input_file = path.join(src_folder, "%04d.dat" % i)
            if StoreHelper.is_file_exist(input_file):
                file_content_list.append(StoreHelper.read_file(input_file))
            else:
                print("%s does not exist!" % input_file)
        probability_dict = SegmentHelper.generate_probability_dict(file_content_list)
        StoreHelper.store_data(probability_dict, probability_dict_path)
        print("Finished generating user dict")
    else:
        probability_dict = StoreHelper.load_data(probability_dict_path, {})
        print("Loaded dict from file, %i records in dict" % len(probability_dict))

    # segment each input file line by line and save the resulting word list
    for i in range(8535):
        input_file = path.join(src_folder, "%04d.dat" % i)
        if StoreHelper.is_file_exist(input_file):
            output_file = path.join(dst_folder, "%04d.dat" % i)
            file_content = StoreHelper.read_file(input_file)
            word_list = []
            for line in file_content.splitlines():
                word_list.extend(SegmentHelper.phase_segment(probability_dict, line, threshold))
            StoreHelper.save_file(os.linesep.join(word_list), output_file)
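A hedged usage sketch: the folder names match the other examples on this page, but the threshold value is an arbitrary assumption, not one taken from the source project.

# first run: build probability.dict under dst_folder while segmenting
run_script('./data/clean_post_lemmatize', './data/phrase_split', 0.5)
# later runs can reuse the stored dict instead of regenerating it
run_script('./data/clean_post_lemmatize', './data/phrase_split', 0.5, generate_dict=False)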
Example #6
def generate_phase_list():
    probability_dict = StoreHelper.load_data('./data/probability.dic', {})
    print("Got %i records from the dict file" % len(probability_dict))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            word_file = "./data/phrase_split/%04d.dat" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context)
            position_dict_list = position_helper.convert_2(probability_dict)
            StoreHelper.save_file("\n".join([str(item) for item in position_dict_list]), word_file)
        else:
            print("%s does not exist!" % text_file)
Example #7
def get_tfidf():
    blob_dict_list = Main.generate_blob_list()
    profile_dict_list = StoreHelper.load_data('./resource/merged_profile.dat', [])
    blob_dict_list.extend(profile_dict_list)
    tfidf = TFIDF(blob_dict_list)
    # j tracks the position within blob_dict_list, which presumably holds one
    # blob per existing file in file order, so it advances independently of i
    j = 0
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on article %s!" % text_file)
            tf_idf_dict = tfidf.get_tf_idf(blob_dict_list[j])
            StoreHelper.store_data(tf_idf_dict, "./data/tfidf-dat/%04d.dat" % i)
            StoreHelper.save_file(DictHelper.get_sorted_list(tf_idf_dict), "./data/tfidf/%04d.dat" % i)
            j += 1
Example #8
def generate_all_text():
    crawl_dict = StoreHelper.parse_file("./resource/url_list")
    count_numbers = 0
    for location in crawl_dict.keys():
        file_name = "./data/post/%s.dat" % location
        positions = StoreHelper.load_data(file_name, [])
        for url, web_source in positions:
            # keep only postings that mention the target role
            if 'data scientist' in web_source.lower():
                text_content = HTMLHelper.get_text(web_source)
                # text_dict = WordFrequency.get_frequency_dict(text_content)
                # output = [str(item) for item in text_dict]
                # output.extend([" ", text_content, " ",  url])
                StoreHelper.save_file(text_content, "./data/datascientist/%04d.txt" % count_numbers)
                count_numbers += 1
            else:
                print("'Data scientist' not found in %s!" % url)
Example #9
def convert_position():
    skills_dict = StoreHelper.load_data("./resource/skills.dat", {})
    print("Got %i words from skills_dict" % len(skills_dict))
    discipline_dict = StoreHelper.load_data("./resource/discipline.dat", {})
    print("Got %i words from discipline_dict" % len(discipline_dict))
    education_dict = StoreHelper.load_data("./resource/education.dat", {})
    print("Got %i words from education_dict" % len(education_dict))
    responsibility_dict = StoreHelper.load_data("./resource/responsibility.dat", {})
    print("Got %i words from responsibility_dict" % len(responsibility_dict))
    for i in range(8535):
        text_file = "./data/clean_post_lemmatize/%04d.dat" % i
        if StoreHelper.is_file_exist(text_file):
            print("working on file %s" % text_file)
            word_list = StoreHelper.load_data("./data/gensim_split/%04d.dat" % i, [])
            word_data = "./data/result_dict/%04d.dat" % i
            word_text = "./data/result_dict/%04d.txt" % i
            context = StoreHelper.read_file(text_file)
            position_helper = PositionHelper(context, word_list)
            result_dict = position_helper.convert(skills_dict, discipline_dict, education_dict,
                                                  responsibility_dict, './resource/year_convert.dat')
            StoreHelper.save_file(result_dict, word_text)
            StoreHelper.store_data(result_dict, word_data)
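Taken together, the file paths in these examples imply a processing order, inferred here from what each function reads and writes rather than stated anywhere in the source project:

get_tfidf()                  # writes ./data/tfidf-dat/*.dat             (Example #7)
convert_position()           # writes ./data/result_dict/*.dat           (this example)
get_only_words_in_5()        # joins both into ./data/words_only/data/*  (Example #2)
generate_feature_vectors()   # merges those into feature_vector.dat      (Example #1)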
Example #10
def extract_download_data():
    crawl_dict = Main.parse_file("./resource/url_list")
    total_numbers = 0
    for location in crawl_dict.keys():
        file_name = "./data/post/%s.dat" % location
        positions = StoreHelper.load_data(file_name, [])
        print("Found %i records in %s" % (len(positions), file_name))
        for url, position in positions:
            # step 1, store origin file
            # output1 = "./data/text/%04d.html" % total_numbers
            # StoreHelper.save_file(position, output1)
            output2 = "./data/clean_post_without_header/%04d.dat" % total_numbers
            print("work on position: %4d" % total_numbers)
            status, content = HTMLHelper.get_post(position)
            if status is False:
                print("Error happened while extracting %s" % url)
                # StoreHelper.save_file(position, output2)
            else:
                StoreHelper.save_file(HTMLHelper.post_clean(content), output2)
            total_numbers += 1
    print("In summary, processed %i records in total!" % total_numbers)
Example #11
def generate_feature_list():
    vector_data = StoreHelper.load_data('vector.dat', [])
    # vector.dat stores the five feature vectors in a fixed order
    vector_dict = {'year': vector_data[0], 'education': vector_data[1], 'major': vector_data[2],
                   'skill': vector_data[3], 'responsibility': vector_data[4]}
    StoreHelper.save_file(vector_dict, 'vector.txt')