if __name__ == '__main__':
    # Script entry point. Two independent stages run in sequence:
    #   1. build bag-of-words vectors for the experience train/test files;
    #   2. build bag-of-words vectors for the education entries of the
    #      stored ratings file.
    # The classification / rating calls themselves are left commented out,
    # exactly as in the original script.
    c = CosineSimilarity()
    t = TextProcessing()

    # ---- Stage 1: experience classification -------------------------------
    f = FileProcessor('experience_classification', 'train')
    data, label = f.cleanFile()

    feature_in_category = t.get_feature_in_category(data, label)
    # NOTE(review): local_neighbour is not referenced again in this section;
    # kept because the call may have side effects or be used elsewhere.
    local_neighbour = t.get_local_neighbours(feature_in_category)
    global_neighbour = t.get_global_neighbours(feature_in_category)
    # First element of every entry in global_neighbour.
    global_words = [global_neighbour[i][0] for i in global_neighbour]
    sorted_local_neighbours = t.get_local_neighbours_sorted(feature_in_category)
    # for i in sorted_local_neighbours:
    #     print(sorted_local_neighbours[i])

    ## Training Data: 4-6%
    revised_data = model_construction(data, global_words, sorted_local_neighbours)
    # for i in revised_data:
    #     print(i)
    vocabSet = c.vocabSet(revised_data)
    bagOfWords = [c.bag_of_words(vocabSet, i) for i in revised_data]
    # print(training_classification(revised_data, label, bagOfWords, k=3))

    ## Test Data: 4-6%
    test_file = FileProcessor('experience_unclassified', 'test')
    # NOTE(review): the training call above unpacks cleanFile() into
    # (data, label); here the result is used whole -- confirm that
    # cleanFile() returns only the data for a 'test' FileProcessor.
    test_data = test_file.cleanFile()
    revised_test_data = model_construction(test_data, global_words, sorted_local_neighbours)
    # test_vocabSet = c.vocabSet(revised_data)
    # The test vectors are deliberately built against the training vocabSet.
    test_bagOfWords = [c.bag_of_words(vocabSet, i) for i in revised_test_data]
    # print(classification(revised_test_data, test_bagOfWords, revised_data, label, bagOfWords))

    # ---- Stage 2: education ratings ----------------------------------------
    ratings_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_rating_backend'
    ratings_file = FileProcessor(ratings_file_path, 'train')
    ratings = ratings_file.readByteFile()
    # Drop the '\n' separator records and any None entries.
    # `is not None` is the identity test for the singleton (was `!= None`).
    ratings = [i for i in ratings if i != '\n' and i is not None]

    big_list = []  # one education feature entry per rating record
    r = []         # the matching 'rating' value of each record
    for i in ratings:
        edu = get_education(i)
        # exp = get_experience(i)
        rat = i.get('rating')
        big_list.append(edu)
        # big_list.append(exp)
        r.append(rat)

    # NOTE: this rebinds vocabSet, replacing the training vocabulary built in
    # stage 1 with one derived from the education entries.
    vocabSet = c.vocabSet(big_list)
    wordVectors = [c.bag_of_words(vocabSet, i) for i in big_list]
    # print(calculate_rating_dict(items[4], r, wordVectors, vocabSet))

    # duplicate(byte_file_path, ratings_file_path)

    # li = [
    #     {'education': ['University of Waterloo', 'Degree Name', 'BCS', 'Field Of Study', 'Computer Science'],
    #      'rating': [5, 5, 10],
    #      'experience': ['Software Engineering Intern', 'MemSQL'],
    #      'header': ['Jacob Jackson', '--', 'https://www.linkedin.com/in/jacobbfjackson/']},
    #     {'education': ['University of Toronto', 'Degree Name', 'PhD', 'Field Of Study', 'Computer Science'],
    #      'rating': [4, 2, 6],
    #      'experience': ['Lecturer', 'University of Toronto', 'Research Assistant', 'University of Toronto',
    #                     'Teaching Assistant', 'University of Toronto', 'Summer Intern', 'Greenplum',
    #                     'Teaching Assistant', 'University of Toronto'],
    #      'header': ['Bogdan Simion', 'Lecturer at University of Toronto',
    #                 'https://www.linkedin.com/in/bogdan-simion-1113b27/']}
    # ]
    # with open(byte_file_path, 'ab') as f:
    #     for i in li:
    #         pickle.dump(i, f)
    #         pickle.dump('\n', f)

    # get_ratings(ratings_file_path)
    # edu = get_education(items[1])