def load(self):
    """Load recommender matrices, index stats and the raw index from disk.

    All three data files are optional (e.g. on a fresh deployment): each is
    loaded best-effort and replaced by an empty default when missing, so the
    service keeps running and listens for data from the appservers.
    """
    try:
        tag_recommendation_data = loadFromJson(
            RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json')
        DATABASE = tag_recommendation_data['database']
        CLASSES = tag_recommendation_data['classes']
        self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE, classes=CLASSES)
        self.cbtr.load_recommenders()
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # are not swallowed while keeping the best-effort semantics.
        self.cbtr = None
        logger.info("No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come).")
    try:
        self.index_stats = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_index_stats.json')
        logger.info("Matrices computed out of information from %i sounds" % self.index_stats['n_sounds_in_matrix'])
    except Exception as e:
        # Log through the service logger instead of print() so the message
        # ends up in the service log, not a lost stdout.
        logger.info("Could not load index stats (%s), using empty defaults." % e)
        self.index_stats = {
            'n_sounds_in_matrix': 0,
        }
    try:
        self.index = loadFromJson(RECOMMENDATION_DATA_DIR + 'Index.json')
        self.index_stats['biggest_id_in_index'] = max([int(key) for key in self.index.keys()])
        self.index_stats['n_sounds_in_index'] = len(self.index.keys())
    except Exception:
        logger.info("Index file not present. Listening for indexing data from appservers.")
        self.index_stats['biggest_id_in_index'] = 0
        self.index_stats['n_sounds_in_index'] = 0
        self.index = dict()
def load(self):
    """Load recommender matrices and index stats from disk, best-effort.

    Missing data files are tolerated: the recommender is set to None and the
    stats fall back to empty defaults so the service can keep listening.
    """
    try:
        tag_recommendation_data = loadFromJson(
            RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json')
        DATABASE = tag_recommendation_data['database']
        CLASSES = tag_recommendation_data['classes']
        self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE, classes=CLASSES)
        self.cbtr.load_recommenders()
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # propagate; any loading error still falls back gracefully.
        self.cbtr = None
        logger.info(
            "No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come)."
        )
    try:
        self.index_stats = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_index_stats.json')
        logger.info("Matrices computed out of information from %i sounds" % self.index_stats['n_sounds_in_matrix'])
    except Exception as e:
        # `except Exception, e` is legacy Python 2-only syntax; `as` works on
        # Python 2.6+ and 3. Log instead of printing to stdout.
        logger.info("Could not load index stats (%s), using empty defaults." % e)
        self.index_stats = {
            'n_sounds_in_matrix': 0,
        }
def process_tag_recommendation_data(self, resources_limit=None, tag_threshold=10,
                                    line_limit=99999999999999,
                                    recompute_all_classes=False,
                                    similarity_metric="cosine"):
    """Rebuild recommendation data: association matrix, then resource classes.

    NOTE(review): this copy of the method appears truncated -- it stops right
    after loading (or defaulting) `resource_class`; compare with the fuller
    copy of the same method elsewhere in this file.
    """
    # Process tas file and turn into association matrix and derived files;
    # returns the date-stamped prefix used for the generated files.
    database_name = self.tas_to_association_matrix(
        tag_threshold=tag_threshold, line_limit=line_limit)
    print "Loading community detector..."
    # Classifier data is expected under RECOMMENDATION_DATA_DIR + "Classifier".
    cd = CommunityDetector(verbose=False, PATH=RECOMMENDATION_DATA_DIR + "Classifier")
    print cd
    # Classify existing resources
    resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR + database_name + '_RESOURCES_TAGS.json')
    instances_ids = resources_tags.keys()
    try:
        # Reuse previously computed per-resource classes when available.
        resource_class = loadFromJson(
            RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json')
    except Exception, e:  # legacy Python 2 `except ..., e` syntax
        # No classified-resources file on disk yet; start from scratch.
        resource_class = dict()
def load(self):
    """Load recommender matrices, index stats and the raw index from disk.

    Paths are resolved through tr_settings.RECOMMENDATION_DATA_DIR. Each of
    the three data files is loaded best-effort; a missing file yields an
    empty default so the service keeps listening for appserver data.
    """
    try:
        tag_recommendation_data = loadFromJson(
            tr_settings.RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json')
        DATABASE = tag_recommendation_data['database']
        CLASSES = tag_recommendation_data['classes']
        self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE, classes=CLASSES)
        self.cbtr.load_recommenders()
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # are not swallowed.
        self.cbtr = None
        logger.info("No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come).")
    try:
        self.index_stats = loadFromJson(tr_settings.RECOMMENDATION_DATA_DIR + 'Current_index_stats.json')
        logger.info("Matrices computed out of information from %i sounds" % self.index_stats['n_sounds_in_matrix'])
    except Exception as e:
        # Log through the service logger instead of print() so the failure
        # is visible in the service log.
        logger.info("Could not load index stats (%s), using empty defaults." % e)
        self.index_stats = {
            'n_sounds_in_matrix': 0,
        }
    try:
        self.index = loadFromJson(tr_settings.RECOMMENDATION_DATA_DIR + 'Index.json')
        self.index_stats['biggest_id_in_index'] = max([int(key) for key in self.index.keys()])
        self.index_stats['n_sounds_in_index'] = len(self.index.keys())
    except Exception:
        logger.info("Index file not present. Listening for indexing data from appservers.")
        self.index_stats['biggest_id_in_index'] = 0
        self.index_stats['n_sounds_in_index'] = 0
        self.index = dict()
def __init__(self, verbose=True, classifier_type="svm", PATH=None,
             INIT_METHOD="ZeroInit", selected_instances=None):
    """Load a previously trained classifier from disk.

    :param verbose: enable progress/debug output
    :param classifier_type: default classifier type; overridden by the
        value stored in the classifier's meta file
    :param PATH: path prefix of the persisted classifier (``<PATH>.pkl``
        and ``<PATH>_meta.json`` must exist); required
    :param INIT_METHOD: initialization method label stored on the instance
    :param selected_instances: optional subset of training instances
    :raises Exception: if PATH is not given or the classifier files are
        missing from the classifiers folder
    """
    self.verbose = verbose
    self.n_training_instances = 0
    self.clf_type = classifier_type
    self.class_name_ids = dict()
    self.init_method = INIT_METHOD
    self.selected_instances = selected_instances
    if PATH is None:
        # Fail early with a clear message instead of the opaque TypeError
        # that `PATH + ".pkl"` would raise on the default value.
        raise Exception("No classifier path (PATH) given.")
    if not os.path.exists(PATH + ".pkl") or \
            not os.path.exists(PATH + "_meta.json") or \
            not os.path.exists(RECOMMENDATION_DATA_DIR + 'Classifier_TAG_NAMES.npy'):
        raise Exception("Classifier not existing in classifiers folder.")
    self.clf = joblib.load(PATH + ".pkl")
    meta = loadFromJson(PATH + "_meta.json")
    # The persisted meta file is authoritative: it overrides the
    # classifier_type passed to the constructor.
    self.clf_type = meta['clf_type']
    self.class_name_ids = meta['class_name_ids']
    self.n_training_instances = meta['n_training_instances']
    self.tag_names = load(RECOMMENDATION_DATA_DIR + 'Classifier_TAG_NAMES.npy')
def load(self):
    """Best-effort load of the tag recommender and its index stats.

    Either file may be absent (fresh deployment); in that case the
    recommender stays None and the stats default to zero sounds.
    """
    try:
        tag_recommendation_data = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_database_and_class_names.json')
        DATABASE = tag_recommendation_data['database']
        CLASSES = tag_recommendation_data['classes']
        self.cbtr = CommunityBasedTagRecommender(dataset=DATABASE, classes=CLASSES)
        self.cbtr.load_recommenders()
    except Exception:
        # Narrowed from a bare `except:` (which would also trap SystemExit
        # and KeyboardInterrupt).
        self.cbtr = None
        logger.info("No computed matrices were found, recommendation system not loading for the moment (but service listening for data to come).")
    try:
        self.index_stats = loadFromJson(RECOMMENDATION_DATA_DIR + 'Current_index_stats.json')
        logger.info("Matrices computed out of information from %i sounds" % self.index_stats['n_sounds_in_matrix'])
    except Exception as e:
        # `except Exception, e` was legacy py2-only syntax; also route the
        # message through the logger rather than bare print.
        logger.info("Could not load index stats (%s), using empty defaults." % e)
        self.index_stats = {
            'n_sounds_in_matrix': 0,
        }
def process_tag_recommendation_data(self, resources_limit=None, tag_threshold=10,
                                    line_limit=99999999999999,
                                    recompute_all_classes=False,
                                    similarity_metric="cosine"):
    """Recompute recommendation data (association matrix + resource classes).

    NOTE(review): this copy stops right after loading (or defaulting)
    `resource_class` -- it looks truncated compared with the complete copy of
    the same method further down in this file.
    """
    # Process tas file and turn into association matrix and derived files;
    # the returned name is the date-stamped prefix of the generated files.
    database_name = self.tas_to_association_matrix(tag_threshold=tag_threshold, line_limit=line_limit)
    print "Loading community detector..."
    cd = CommunityDetector(verbose=False, PATH=RECOMMENDATION_DATA_DIR + "Classifier")
    print cd
    # Classify existing resources
    resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR + database_name + '_RESOURCES_TAGS.json')
    instances_ids = resources_tags.keys()
    try:
        # Reuse per-resource classifications computed on a previous run.
        resource_class = loadFromJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json')
    except Exception, e:  # legacy Python 2 `except ..., e` syntax
        # No previous classifications on disk; start with an empty mapping.
        resource_class = dict()
def cutTermini(config, setting):
    """Remove loose terminal residues (per the cut log) from a PDB file.

    Reads the residue lists 'looseTerminiFront' and 'looseTerminiBack' from
    the JSON cut log and writes the input PDB minus those residues to the
    configured output file. Honors the setting's 'verbose' and 'dryRun'
    flags; a dry run performs no file I/O beyond config lookups.
    """
    cutSetting = config.getSetting(setting)
    inputPdb = config.getInputFile(setting, "pdb")
    cutlog = config.getInputFile(setting, "cutlog")
    cutPdb = config.getOutputFile(setting, "out")
    # Reuse the settings dict fetched above instead of calling
    # config.getSetting(setting) again for each flag (original re-fetched
    # it twice and left cutSetting unused).
    if cutSetting['verbose']:
        print("Cut Termini from " + inputPdb + " and output to " + cutPdb)
    if not cutSetting["dryRun"]:
        log = utils.loadFromJson(cutlog)
        residues = log['looseTerminiFront'] + log['looseTerminiBack']
        pdblines = utils.readFileToList(inputPdb)
        utils.cutTerminiAndWriteToPdb(residues, pdblines, cutPdb)
def tas_to_association_matrix(self, tag_threshold=0, line_limit=1000000000): index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json") # Get tags from file ts = [] idx = 0 n_original_associations = 0 sound_ids = [] if self.verbose: print "Reading index file (%i entries)..." % len(index.items()), for sid, tags in index.items(): ts += tags n_original_associations += len(tags) sound_ids.append(sid) idx += 1 if idx > line_limit: break stats = { 'n_sounds_in_matrix': len(sound_ids), #'biggest_id': max([int(sid) for sid in sound_ids]) } saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json', stats) if self.verbose: print "done!" # Compute tag ocurrences after loading the file tag_occurrences = dict() unique_ts = list(set(ts)) for id, t in enumerate(unique_ts): tag_occurrences[t] = ts.count(t) if self.verbose: sys.stdout.write("\rComputing tag occurrences %.2f%%" % (float(100 * (id + 1)) / len(unique_ts))) sys.stdout.flush() print "" tags = [] tags_ids = [] for id, t in enumerate(unique_ts): if tag_occurrences[t] >= tag_threshold: tags.append(t) tags_ids.append(id) if self.verbose: sys.stdout.write("\rFiltering tags %.2f%%" % (float(100 * (id + 1)) / len(unique_ts))) sys.stdout.flush() nTags = len(tags) if self.verbose: print "" print "\tOriginal number of tags: " + str(len(unique_ts)) print "\tTags after filtering: " + str(nTags) # Generate resource-tags dictionary only with filtered tags if self.verbose: print "Reading file for resources...", sys.stdout.flush() res_tags = {} res_user = {} res_tags_no_filt = {} idx = 0 n_filtered_associations = 0 for sid, stags in index.items(): resource = sid user = None assigned_tags = stags assigned_tags_filt = list( set(assigned_tags).intersection(set(tags))) res_tags_no_filt[resource] = assigned_tags res_user[resource] = user if len(assigned_tags_filt) > 0: res_tags[resource] = assigned_tags_filt n_filtered_associations += len(assigned_tags_filt) idx += 1 if idx > line_limit: break resources = res_tags.keys() 
nResources = len(resources) resources_ids = range(0, nResources) if self.verbose: print "done!" # Generate assocoation matrix if self.verbose: print "\tOriginal number of associations: " + str( n_original_associations) print "\tAssociations after filtering: " + str( n_filtered_associations) if self.verbose: print 'Creating empty array of ' + str(nResources) + ' x ' + str( nTags) + '...', M = spmatrix.ll_mat(nResources, nTags) if self.verbose: print 'done!' done = 0 for r_id in resources: for t in res_tags[r_id]: M[resources.index(r_id), tags.index(t)] = 1 done += 1 if self.verbose: sys.stdout.write( "\rGenerating association matrix %.2f%%" % (float(100 * done) / n_filtered_associations)) sys.stdout.flush() if self.verbose: print "" # Save data if self.verbose: print "Saving association matrix, resource ids, tag ids and tag names" filename = "FS%.4i%.2i%.2i" % (datetime.today().year, datetime.today().month, datetime.today().day) M.export_mtx(RECOMMENDATION_TMP_DATA_DIR + filename + '_ASSOCIATION_MATRIX.mtx') save(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCE_IDS.npy', resources) save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_IDS.npy', tags_ids) save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_NAMES.npy', tags) saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS.json', res_tags, verbose=self.verbose) #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS_NO_FILTER.json',res_tags_no_filt, verbose = self.verbose) #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_USER.json',res_user, verbose = self.verbose) return filename
def tas_to_association_matrix(self, tag_threshold=0, line_limit=1000000000):
    """Build the resource/tag association matrix from Index.json.

    Tags occurring fewer than `tag_threshold` times are discarded and at most
    `line_limit` index entries are read. Saves the sparse matrix plus the
    resource-id, tag-id and tag-name lookup files under a date-stamped
    "FSyyyymmdd" prefix in RECOMMENDATION_TMP_DATA_DIR and returns that prefix.

    NOTE(review): tag counting uses ts.count(t) per unique tag and the matrix
    fill uses list.index() per cell -- both quadratic; acceptable for small
    indexes but slow on large ones.
    """
    index = loadFromJson(RECOMMENDATION_DATA_DIR + "Index.json")
    # Get tags from file: flatten all tag assignments into `ts`.
    ts = []
    idx = 0
    n_original_associations = 0
    sound_ids = []
    if self.verbose:
        print "Reading index file (%i entries)..." % len(index.items()),
    for sid, tags in index.items():
        ts += tags
        n_original_associations += len(tags)
        sound_ids.append(sid)
        idx += 1
        if idx > line_limit:
            break
    stats = {
        'n_sounds_in_matrix': len(sound_ids),
        #'biggest_id': max([int(sid) for sid in sound_ids])
    }
    saveToJson(RECOMMENDATION_TMP_DATA_DIR + 'Current_index_stats.json', stats)
    if self.verbose:
        print "done!"
    # Compute tag ocurrences after loading the file
    tag_occurrences = dict()
    unique_ts = list(set(ts))
    for id, t in enumerate(unique_ts):  # NOTE(review): `id` shadows the builtin
        tag_occurrences[t] = ts.count(t)
        if self.verbose:
            sys.stdout.write("\rComputing tag occurrences %.2f%%"%(float(100*(id+1))/len(unique_ts)))
            sys.stdout.flush()
    print ""
    # Keep tags with at least tag_threshold occurrences; tags_ids records
    # each kept tag's position in unique_ts.
    tags = []
    tags_ids = []
    for id, t in enumerate(unique_ts):
        if tag_occurrences[t] >= tag_threshold:
            tags.append(t)
            tags_ids.append(id)
        if self.verbose:
            sys.stdout.write("\rFiltering tags %.2f%%"%(float(100*(id+1))/len(unique_ts)))
            sys.stdout.flush()
    nTags = len(tags)
    if self.verbose:
        print ""
        print "\tOriginal number of tags: " + str(len(unique_ts))
        print "\tTags after filtering: " + str(nTags)
    # Generate resource-tags dictionary only with filtered tags
    if self.verbose:
        print "Reading file for resources...",
        sys.stdout.flush()
    res_tags = {}
    res_user = {}
    res_tags_no_filt = {}
    idx = 0
    n_filtered_associations = 0
    for sid, stags in index.items():
        resource = sid
        user = None  # user info is not available in the index
        assigned_tags = stags
        assigned_tags_filt = list(set(assigned_tags).intersection(set(tags)))
        res_tags_no_filt[resource] = assigned_tags
        res_user[resource] = user
        # Only resources that keep at least one tag enter the matrix.
        if len(assigned_tags_filt) > 0:
            res_tags[resource] = assigned_tags_filt
            n_filtered_associations += len(assigned_tags_filt)
        idx += 1
        if idx > line_limit:
            break
    resources = res_tags.keys()
    nResources = len(resources)
    resources_ids = range(0,nResources)
    if self.verbose:
        print "done!"
    # Generate assocoation matrix
    if self.verbose:
        print "\tOriginal number of associations: " + str(n_original_associations)
        print "\tAssociations after filtering: " + str(n_filtered_associations)
    if self.verbose:
        print 'Creating empty array of ' + str(nResources) + ' x ' + str(nTags) + '...',
    # Sparse linked-list matrix, one row per resource, one column per tag.
    M = spmatrix.ll_mat(nResources, nTags)
    if self.verbose:
        print 'done!'
    done = 0
    for r_id in resources:
        for t in res_tags[r_id]:
            M[resources.index(r_id),tags.index(t)] = 1
            done += 1
            if self.verbose:
                sys.stdout.write("\rGenerating association matrix %.2f%%" % (float(100*done)/n_filtered_associations))
                sys.stdout.flush()
    if self.verbose:
        print ""
    # Save data under a date-stamped prefix so runs do not overwrite each other.
    if self.verbose:
        print "Saving association matrix, resource ids, tag ids and tag names"
    filename = "FS%.4i%.2i%.2i" % (datetime.today().year, datetime.today().month, datetime.today().day)
    M.export_mtx(RECOMMENDATION_TMP_DATA_DIR + filename + '_ASSOCIATION_MATRIX.mtx')
    save(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCE_IDS.npy',resources)
    save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_IDS.npy',tags_ids)
    save(RECOMMENDATION_TMP_DATA_DIR + filename + '_TAG_NAMES.npy',tags)
    saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS.json',res_tags, verbose = self.verbose)
    #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_TAGS_NO_FILTER.json',res_tags_no_filt, verbose = self.verbose)
    #saveToJson(RECOMMENDATION_TMP_DATA_DIR + filename + '_RESOURCES_USER.json',res_user, verbose = self.verbose)
    return filename
def process_tag_recommendation_data(self, resources_limit=None, tag_threshold=10,
                                    line_limit=99999999999999,
                                    recompute_all_classes=False,
                                    similarity_metric="cosine"):
    """Recompute all recommendation data from the current index.

    Pipeline: (1) build the association matrix, (2) classify every resource
    into a community/class (reusing cached classes unless
    `recompute_all_classes` is set), (3) compute similarity data for the
    general recommender, (4) compute one recommender per detected class.

    :param resources_limit: cap on training resources per recommender
        (None = no limit)
    :param tag_threshold: minimum tag occurrences kept in the matrix
    :param line_limit: cap on index entries processed
    :param recompute_all_classes: reclassify every resource even if cached
    :param similarity_metric: metric passed to the similarity computation
    :raises Exception: if a class ends up with no training resources
    """
    # Process tas file and turn into association matrix and derived files;
    # returns the date-stamped prefix of the generated files.
    database_name = self.tas_to_association_matrix(tag_threshold=tag_threshold, line_limit=line_limit)
    print "Loading community detector..."
    cd = CommunityDetector(verbose=False, PATH=RECOMMENDATION_DATA_DIR + "Classifier")
    print cd
    # Classify existing resources
    resources_tags = loadFromJson(RECOMMENDATION_TMP_DATA_DIR + database_name + '_RESOURCES_TAGS.json')
    instances_ids = resources_tags.keys()
    try:
        # Reuse classifications computed on a previous run when present.
        resource_class = loadFromJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json')
    except Exception as e:
        resource_class = dict()
    for count, id in enumerate(instances_ids):  # NOTE(review): `id` shadows the builtin
        if not recompute_all_classes:
            # Only classify resources not already in the cached mapping.
            if id not in resource_class:
                resource_class[id] = cd.detectCommunity(input_tags=resources_tags[id])
        else:
            resource_class[id] = cd.detectCommunity(input_tags=resources_tags[id])
        if self.verbose:
            sys.stdout.write("\rClassifying resources... %.2f%%"%(float(100*(count+1))/len(instances_ids)))
            sys.stdout.flush()
    print ""
    # Persist the (possibly updated) per-resource classes for future runs.
    saveToJson(RECOMMENDATION_DATA_DIR + 'Classifier_classified_resources.json', resource_class)
    print ""
    print "\nComputing data for general recommender..."
    self.association_matrix_to_similarity_matrix(
        dataset=database_name,
        training_set=instances_ids[0:resources_limit],
        save_sim=True,
        is_general_recommender=True,
        metric=similarity_metric,
    )
    print "\nComputing data for class recommenders..."
    # Pair every instance with its class and collect the distinct classes.
    instance_id_class = []
    distinct_classes = []
    for count, instance_id in enumerate(instances_ids):
        class_id = resource_class[instance_id]
        instance_id_class.append([instance_id, class_id])
        if class_id not in distinct_classes:
            distinct_classes.append(class_id)
    print distinct_classes
    for collection_id in distinct_classes:
        print "\nComputing recommender for collection %s..." % collection_id
        # All resources from the training set classified as the selected category
        # (instead of all manually labeled)
        training_ids = []
        for instance in instance_id_class:
            if instance[1] == collection_id:
                training_ids.append(instance[0])
        # Add limit
        training_ids = training_ids[0:resources_limit]
        if len(training_ids) < 1:
            # NOTE(review): message should read "Too few training ids".
            raise Exception("Too less training ids for collection %s" % collection_id)
        self.association_matrix_to_similarity_matrix(
            dataset=database_name,
            training_set=training_ids,
            save_sim=True,
            out_name_prefix=collection_id,
            is_general_recommender=False,
            metric=similarity_metric,
        )