# Example no. 1
if __name__ == '__main__':
    # Script entry point: builds bag-of-words feature vectors for experience
    # classification (train + test), then for a pickled LinkedIn-ratings file.
    c = CosineSimilarity()
    t = TextProcessing()

    # --- Training data ---
    f = FileProcessor('experience_classification', 'train')
    data, label = f.cleanFile()
    feature_in_category = t.get_feature_in_category(data, label)
    local_neighbour = t.get_local_neighbours(feature_in_category)
    global_neighbour = t.get_global_neighbours(feature_in_category)
    # First element of each global-neighbour entry is the word itself
    # (presumably; confirm against TextProcessing.get_global_neighbours).
    global_words = [global_neighbour[i][0] for i in global_neighbour]
    sorted_local_neighbours = t.get_local_neighbours_sorted(feature_in_category)

    revised_data = model_construction(data, global_words, sorted_local_neighbours)
    vocabSet = c.vocabSet(revised_data)
    bagOfWords = [c.bag_of_words(vocabSet, i) for i in revised_data]
    # e.g. print(training_classification(revised_data, label, bagOfWords, k=3))

    # --- Test data ---
    test_file = FileProcessor('experience_unclassified', 'test')
    test_data = test_file.cleanFile()
    revised_test_data = model_construction(test_data, global_words, sorted_local_neighbours)
    # NOTE: the test bag-of-words intentionally reuses the TRAINING vocabSet so
    # that train and test vectors share the same feature space.
    test_bagOfWords = [c.bag_of_words(vocabSet, i) for i in revised_test_data]
    # e.g. print(classification(revised_test_data, test_bagOfWords, revised_data, label, bagOfWords))

    # --- Ratings data ---
    # TODO(review): hard-coded absolute path — take from CLI args or config instead.
    ratings_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_rating_backend'
    ratings_file = FileProcessor(ratings_file_path, 'train')
    ratings = ratings_file.readByteFile()
    # Drop record-separator newlines and empty entries from the pickled stream.
    ratings = [i for i in ratings if i != '\n' and i is not None]

    # One education-feature list and one rating per profile, kept in lockstep.
    big_list = [get_education(i) for i in ratings]
    r = [i.get('rating') for i in ratings]

    # Rebind vocabSet/word vectors to the ratings corpus for the rating model;
    # the experience-classification vectors above are already built.
    vocabSet = c.vocabSet(big_list)
    wordVectors = [c.bag_of_words(vocabSet, i) for i in big_list]