import pickle


def rating_prompt(original_data, filename='linkedin_rating_backend'):
    """
    (Couldn't use the original rating prompt because dealing with dictionaries)
    (Really only used once in the beginning...)
    Rating prompt that let me rate people by hand to seed the recommender system.

    Parameters:
    -original_data: list of parsed and cleaned dictionaries
    -filename: the file where the dictionaries are located

    rating = [edu, exp, total]
    """
    li = []
    for i in original_data:
        print(i)
        edu = int(
            input('Enter a rating of education (based on title, 1 to 5): '))
        exp = int(
            input('Enter a rating of experience (based on title, 1 to 5): '))
        total = edu + exp
        i['rating'] = [edu, exp, total]
        li.append(i)  # new dictionaries
    f = FileProcessor(filename, 'train')
    f.eraseFile()  # erase_file(filename)
    with open(filename, 'ab') as b:
        for i in li:
            pickle.dump(i, b)
            pickle.dump('\n', b)
    print('Done!')
def get_ratings(filename):
    """
    Returns a list of ratings

    Parameters:
    -filename: name of the file or path where the rated items are stored
    """
    f = FileProcessor(filename, 'train')
    lines = f.readByteFile()
    lines = [line for line in lines if line != '\n' and line is not None]
    return [line.get('rating') for line in lines]
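# A minimal usage sketch for rating_prompt() and get_ratings(), assuming
# FileProcessor as used above; the profiles path here is hypothetical.
# rating_prompt() is interactive, so this would normally run once to seed the
# ratings file that get_ratings() then reads back.
def _ratings_demo():
    profiles = FileProcessor('linkedin_people_description', 'train').readByteFile()
    profiles = [p for p in profiles if p != '\n' and p is not None]
    rating_prompt(profiles, filename='linkedin_rating_backend')
    print(get_ratings('linkedin_rating_backend'))  # e.g. [[4, 5, 9], ...]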
def description_duplicate_url_checker(url):
    """
    Checks whether a url from the raw_url file is already in the
    linkedin_people_description file
    """
    f = FileProcessor(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description',
        'train')
    items = f.readByteFile()
    li = [i for i in items if i != '\n' and i is not None]
    for i in li:
        link = i.get('header')[-1]
        if url == link:
            return True
    return False
def get_more_urls(org_file, dest_file, num=0):
    """
    Transfers all urls from the raw_url file (same as the js file) to a new
    file with 10+ viewed people. Deletes all the urls transferred from the
    original file.

    Parameters:
    -org_file: initial raw url file
    -dest_file: new file the urls should be transferred to
    -num: default of 0 transfers all the urls in the original raw file
    """
    li = duplicate(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_raw_url',
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_dest_url',
        'txt')
    f = FileProcessor(org_file, 'train')
    lines = f.readFile()
    lines = [line.replace('\n', '') for line in lines if line not in li]
    f.eraseFile()
            errCount += 1
            print(data[i], x, label[i])
    return (errCount / len(bagOfWords)) * 100


def classification(test_data, test_bagOfWords, original_data, original_labels,
                   original_bagOfWords, k=3):
    """
    kNN model-based classifier for the test (actual) data
    """
    for i in range(len(test_bagOfWords)):
        x = classify(np.array(test_bagOfWords[i]),
                     np.array(original_bagOfWords), original_labels, k)
        print(test_data[i], x)


if __name__ == '__main__':
    c = CosineSimilarity()
    t = TextProcessing()
    f = FileProcessor('experience_classification', 'train')
    data, label = f.cleanFile()

    feature_in_category = t.get_feature_in_category(data, label)
    local_neighbour = t.get_local_neighbours(feature_in_category)
    global_neighbour = t.get_global_neighbours(feature_in_category)
    global_words = [global_neighbour[i][0] for i in global_neighbour]
    sorted_local_neighbours = t.get_local_neighbours_sorted(feature_in_category)
    # for i in sorted_local_neighbours:
    #     print(sorted_local_neighbours[i])

    ## Training Data: 4-6%
    revised_data = model_construction(data, global_words,
                                      sorted_local_neighbours)
    # for i in revised_data:
    #     print(i)

    vocabSet = c.vocabSet(revised_data)
    bagOfWords = [c.bag_of_words(vocabSet, i) for i in revised_data]
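# The classify() helper called above is defined elsewhere. A minimal sketch of
# what a kNN classifier over bag-of-words vectors could look like, assuming
# cosine similarity and a majority vote (the name, signature, and tie-breaking
# here are assumptions, not the original implementation):
import numpy as np
from collections import Counter


def classify_sketch(in_vector, training_vectors, labels, k=3):
    # Cosine similarity between the input vector and every training vector,
    # guarding against zero-norm rows.
    norms = np.linalg.norm(training_vectors, axis=1) * np.linalg.norm(in_vector)
    sims = training_vectors @ in_vector / np.where(norms == 0, 1, norms)
    # Take the k most similar rows and return the majority label.
    top_k = np.argsort(sims)[-k:]
    return Counter(labels[i] for i in top_k).most_common(1)[0][0]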
def duplicate(filename1, filename2, type_file):
    """
    Returns a list of items in filename2 that also appear in filename1
    (works on byte or txt files)
    """
    # TODO: This code is terrible!
    if type_file == 'byte':
        items_file1 = FileProcessor(filename1, 'train')
        items_file2 = FileProcessor(filename2, 'train')
        items_file1 = items_file1.readByteFile()
        items_file2 = items_file2.readByteFile()
        # items_file1 = [item for item in items_file1 if item != '\n' and item != None]
        # items_file2 = [item for item in items_file2 if item != '\n' and item != None]
        return [item for item in items_file2 if item in items_file1]
        # return duplicate_list
    elif type_file == 'txt':
        items_file1 = FileProcessor(filename1, 'train')
        items_file2 = FileProcessor(filename2, 'train')
        items_file1 = items_file1.readFile()
        items_file2 = items_file2.readFile()
        # items_file1 = [item for item in items_file1 if item != '\n' and item != None]
        # items_file2 = [item for item in items_file2 if item != '\n' and item != None]
        return [item for item in items_file2 if item in items_file1]
        # return duplicate_list
    else:
        return 'Only byte or txt'
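# The two branches above differ only in which read method they call. One way
# the TODO could be addressed (a sketch, assuming only FileProcessor's
# readByteFile and readFile methods as used above):
def duplicate_sketch(filename1, filename2, type_file):
    readers = {'byte': 'readByteFile', 'txt': 'readFile'}
    if type_file not in readers:
        raise ValueError('Only byte or txt')
    read = readers[type_file]
    items_file1 = getattr(FileProcessor(filename1, 'train'), read)()
    items_file2 = getattr(FileProcessor(filename2, 'train'), read)()
    # A set would make membership checks O(1), but the byte items are dicts
    # (unhashable), so keep the simple linear scan.
    return [item for item in items_file2 if item in items_file1]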
""" """ pass def recommend(): """ """ pass if __name__ == '__main__': t = TextProcessing() c = CosineSimilarity() byte_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description' byte_file = FileProcessor(byte_file_path, 'train') items = byte_file.readByteFile() items = [item for item in items if item != '\n' and item != None] # big_list = [] # education + experience list # for i in items: # print(i) # edu = get_education(i) # exp = get_experience(i) # big_list.append(edu) # big_list.append(exp) # # vocabSet = c.vocabSet(big_list) # wordVectors = [c.bag_of_words(vocabSet, i) for i in big_list] ratings_file_path = '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_rating_backend'
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    # plt.tight_layout()
    plt.show()


def multiplePieGraph():
    """
    Shows all the pie graphs at once
    """
    pass


if __name__ == '__main__':
    t = TextProcessing()
    byte_file = FileProcessor(
        '/Users/Rahul/Desktop/Main/Side_projects/project_2/lifeline/Scripts/link_new/files/linkedin_people_description',
        'train')
    items = byte_file.readByteFile()
    items = [item for item in items if item != '\n' and item is not None]

    # uni_name, degree_name = get_education_data(items)
    # experience, company_name = get_experience_data(items)
    # uni_name, degree_name = clean_data(uni_name), clean_data(degree_name)
    # experience, company_name = clean_data(experience), clean_data(company_name)
    # uni_name_count, degree_name_count, experience_count, company_name_count = count(uni_name), count(
    #     degree_name), count(experience), count(company_name)
    #
    # for i in degree_name:  # split the dual degree
    #     if 'and' in i:
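# multiplePieGraph() above is still a stub. A minimal matplotlib sketch of one
# way to show several pie charts in a single figure; the dict-of-count-dicts
# input mirrors the *_count variables in the commented-out block, but its exact
# format is an assumption:
import matplotlib.pyplot as plt


def multiple_pie_graph_sketch(named_counts):
    """named_counts: {'Universities': {'MIT': 3, ...}, 'Degrees': {...}, ...}"""
    n = len(named_counts)
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
    for ax, (title, counts) in zip(axes[0], named_counts.items()):
        ax.pie(list(counts.values()), labels=list(counts.keys()),
               autopct='%1.1f%%')
        ax.set_title(title)
        ax.axis('equal')  # equal aspect ratio draws each pie as a circle
    plt.show()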
def run():
    settings, args = process_command_line(None)
    try:
        config_rows = read_config_file_as_table(settings.config_file)
        if settings.unified_config_file:
            unified_config_rows = read_config_file_as_table(
                settings.unified_config_file)
    except ConfigException as ex:
        print(ex)
        return 1

    interaction_type_to_condition_files_dictionary = get_condition_files(
        config_rows)
    signif_dictionary = generate_signif_per_conditions_dictionary(config_rows)
    # max_lib_dictionary = generate_max_lib_per_condition_dictionary(config_rows)

    # Mark whether each option was set
    categories_lists = {}
    # categories_lists = {DIV_OPTION_SIGNIF: signif_dictionary}
    if settings.signif:
        categories_lists[DIV_OPTION_SIGNIF] = signif_dictionary
    if settings.known_targets_file:
        known_pair_list = generate_known_pairs_list(
            settings.known_targets_file)
        categories_lists[DIV_OPTION_KNOWN_TARGETS] = known_pair_list
    if settings.no_binding_file:
        binding_pair_list = generate_binding_pairs_list(
            settings.no_binding_file)
        categories_lists[DIV_OPTION_BINDING] = binding_pair_list
    if settings.questionable_file:
        questionable_pair_list = generate_questionable_pairs_list(
            settings.questionable_file)
        categories_lists[DIV_OPTION_QUESTIONABLE] = questionable_pair_list

    interaction_to_condition_totals = {}
    interaction_to_condition_libs = {}
    interaction_to_condition_signif_libs = {}

    # Go over the files and make the total count of interactions per
    # interaction type
    for interaction_type, condition_files_dictionary in \
            interaction_type_to_condition_files_dictionary.items():
        interaction_to_condition_totals[interaction_type] = {}
        interaction_to_condition_libs[interaction_type] = {}
        interaction_to_condition_signif_libs[interaction_type] = {}
        for condition_name, file_list in condition_files_dictionary.items():
            interaction_to_condition_totals[interaction_type][condition_name] = {}
            interaction_to_condition_libs[interaction_type][condition_name] = {}
            interaction_to_condition_signif_libs[interaction_type][condition_name] = {}
            for file_path in file_list:
                fp = FileProcessor(file_path, {
                    "read_count": CountHandler(),
                    "signif_lib_count": MaxLibHandler()
                })
                fp.process()
                interactions_dct = fp.row_handlers["read_count"].dct
                max_signif_lib_dct = fp.row_handlers["signif_lib_count"].dct

                # Handle total count
                for name, count in interactions_dct.items():
                    # Add the name for the first time
                    if name not in interaction_to_condition_totals[interaction_type][condition_name]:
                        interaction_to_condition_totals[interaction_type][condition_name][name] = 0
                    # Increase the count
                    interaction_to_condition_totals[interaction_type][condition_name][name] += count

                # Handle signif libs
                for name, lib_count in max_signif_lib_dct.items():
                    # Add the name for the first time
                    if name not in interaction_to_condition_signif_libs[interaction_type][condition_name]:
                        interaction_to_condition_signif_libs[interaction_type][condition_name][name] = 0
                    # Keep the maximal lib count
                    interaction_to_condition_signif_libs[interaction_type][condition_name][name] = \
                        max(interaction_to_condition_signif_libs[interaction_type][condition_name][name],
                            lib_count)

                # Handle total libs
                for name, lib_count in interactions_dct.items():
                    # Add the name for the first time
                    if name not in interaction_to_condition_libs[interaction_type][condition_name]:
                        interaction_to_condition_libs[interaction_type][condition_name][name] = 0
                    # Count this library
                    interaction_to_condition_libs[interaction_type][condition_name][name] += 1

    # Only if a unified config file was given
    if settings.unified_config_file:
        unified_interaction_type_to_condition_files_dictionary = get_condition_files(
            unified_config_rows)
        unified_interaction_to_condition_totals = {}
        unified_interaction_to_condition_libs = {}
        for interaction_type, condition_files_dictionary in \
                unified_interaction_type_to_condition_files_dictionary.items():
            unified_interaction_to_condition_totals[interaction_type] = {}
            unified_interaction_to_condition_libs[interaction_type] = {}
            for condition_name, file_list in condition_files_dictionary.items():
                unified_interaction_to_condition_totals[interaction_type][condition_name] = {}
                for file_path in file_list:
                    fp = FileProcessor(file_path, {
                        "lib_count": UnifiedLibHandler(
                            interaction_to_condition_libs[interaction_type][condition_name])
                    })
                    fp.process()
                    unified_interaction_to_condition_libs[interaction_type][condition_name] = \
                        fp.row_handlers["lib_count"].dct

    # Generate group tables per condition
    for interaction_type, condition_totals in interaction_to_condition_totals.items():
        perms = []
        # Generate all permutations for the flags
        for i in range(len(categories_lists) + 1):
            values = [True] * i + [False] * (len(categories_lists) - i)
            perms += list(set(itertools.permutations(values)))
        # Generate the different groups according to the flags
        for permutation in perms:
            categories_results = {
                category: permutation[i]
                for i, category in enumerate(categories_lists.keys())
            }
            generate_table(interaction_type, condition_totals,
                           interaction_to_condition_libs[interaction_type],
                           interaction_to_condition_signif_libs[interaction_type],
                           categories_results, categories_lists)

    # Generate unified tables
    if settings.unified_config_file:
        for interaction_type, condition_to_libs in unified_interaction_to_condition_libs.items():
            for condition_name, names_to_libs in condition_to_libs.items():
                with open("output/unified_%s_%s.table" % (interaction_type, condition_name),
                          "wb") as fl:
                    writer = csv.writer(fl, delimiter='\t', quotechar='|',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(["libs", "known", "unknown"])
                    libs_max = max(int(val) for val in names_to_libs.values())
                    for i in range(libs_max + 1):
                        writer.writerow([
                            i,
                            sum(1 for name, val in names_to_libs.items()
                                if val == i and is_known(
                                    name, categories_lists[DIV_OPTION_KNOWN_TARGETS])),
                            sum(1 for name, val in names_to_libs.items()
                                if val == i and not is_known(
                                    name, categories_lists[DIV_OPTION_KNOWN_TARGETS]))
                        ])
    return 0
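# CountHandler, MaxLibHandler, and UnifiedLibHandler are defined elsewhere;
# run() only relies on each handler exposing a .dct mapping after
# FileProcessor.process() has fed it rows. A minimal sketch of what such a row
# handler could look like (the handle_row(name, value) interface is an
# assumption, not the real one):
class CountHandlerSketch(object):
    def __init__(self):
        self.dct = {}

    def handle_row(self, name, value):
        # Accumulate the total read count per interaction name.
        self.dct[name] = self.dct.get(name, 0) + value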