示例#1
0
 def __init__(self):
     """Load the project and user data the recommender works on.

     Creates a ``DataRetriever`` for the ``rm.CACHE`` directory and stores
     the results of ``parseProjectData()`` and ``parseUserFollowers()`` on
     the instance.
     """
     # Single-argument parenthesised form prints the same text on Python 2
     # and is also valid syntax on Python 3.
     print('Initializing Recommender..')
     directory_name = rm.CACHE
     self.data_retriever = DataRetriever(directory_name)
     self.project_data = self.data_retriever.parseProjectData()
     self.user_data, self.user_follower_map = self.data_retriever.parseUserFollowers()
     # No default factory is supplied, so this behaves like an empty dict
     # (missing keys still raise KeyError).
     self.language_proj = defaultdict()
示例#2
0
class Recommender():
    """Recommend projects by language, area of interest and owner PageRank.

    Usage as implied by the methods: construct the object, call
    ``build_project_features()`` once to load (or generate) the model data,
    then call ``recommend_projects()`` per query.
    """

    def __init__(self):
        """Load project and user data through a ``DataRetriever`` on ``rm.CACHE``."""
        print('Initializing Recommender..')
        directory_name = rm.CACHE
        self.data_retriever = DataRetriever(directory_name)
        self.project_data = self.data_retriever.parseProjectData()
        self.user_data, self.user_follower_map = self.data_retriever.parseUserFollowers()
        # No default factory is supplied, so this behaves like an empty dict
        # until build_project_features() replaces it.
        self.language_proj = defaultdict()
        # Filled in by build_project_features(); given empty defaults here so
        # the accessors do not raise AttributeError when called too early.
        self.project_vector = {}
        self.categories = []
        self.user_ranking = []
        self.difficulty_score = None

    def get_languages(self):
        """Return a dict mapping each language name with ' ' replaced by '$'
        to the original language name (the keys of ``self.language_proj``)."""
        lang_dict = {}
        for lang in self.language_proj:
            lang_dict[lang.replace(' ', '$')] = lang
        return lang_dict

    def get_aoi(self):
        """Return the list of categories (areas of interest)."""
        return self.categories

    def build_project_features(self):
        """Get different scores for each project.

        Tries to unpickle the project vector and then the category list from
        ``rm.NB_PROB``. If that fails for any reason, both are rebuilt with
        ``ProjectVectorBuilder`` and written back to ``rm.NB_PROB``.
        Afterwards the user ranking is computed with ``pagerank`` and the
        ``lang_to_projects.p`` and ``new_LOC.p`` pickles are loaded from
        ``rm.CACHE``; errors while reading those two files are not handled
        here and propagate to the caller.
        """
        # NOTE(security): pickle.load can execute arbitrary code; the cache
        # files read in this method must come from a trusted location.
        try:
            with open(rm.NB_PROB, 'rb') as f:
                print("Reading probabilities from: %s" % (rm.NB_PROB,))
                self.project_vector = pickle.load(f)
                self.categories = pickle.load(f)

                print('done.')
                print('#Projects: %d' % len(self.project_vector))
                print('#Categories: %d' % len(self.categories))
        except Exception:
            # Any problem with the cache (missing, truncated, unreadable)
            # falls back to regenerating it. ``Exception`` rather than a bare
            # ``except`` so Ctrl-C / SystemExit are not swallowed.
            print("Generating a new Naive Base classifier")
            self.project_vector_builder = ProjectVectorBuilder(self.project_data)
            self.project_vector = self.project_vector_builder.build_projects_vector()
            self.categories = list(self.project_vector_builder.nb.clf.classes_)
            # Two consecutive pickles in one file, read back in the same order.
            with open(rm.NB_PROB, 'wb') as f:
                pickle.dump(self.project_vector, f)
                pickle.dump(self.categories, f)

        self.user_ranking = pagerank(self.user_data)
        with open(os.path.join(rm.CACHE, 'lang_to_projects.p'), 'rb') as f:
            self.language_proj = pickle.load(f)

        with open(os.path.join(rm.CACHE, 'new_LOC.p'), 'rb') as f:
            self.difficulty_score = pickle.load(f)

    def recommend_projects(self, languages, area_interest, difficulty):
        """Return project descriptions matching the languages and categories.

        Args:
            languages: iterable of keys of ``self.language_proj``; a key that
                is not present raises ``KeyError``.
            area_interest: container of accepted ``'category'`` values.
            difficulty: accepted for interface compatibility; not used.

        Returns:
            A list of dicts. Each is a copy of the project's entry in
            ``self.project_vector`` extended with ``'html_url'``,
            ``'full_name'``, ``'owner'``, ``'page_rank_of_owner'`` (0 when
            the owner is not in ``self.user_ranking``), ``'contributors'``
            and ``'contributors_url'`` (first contributor, or ``''`` when
            there is none).

            With 10 or fewer matches the list is ordered by ``'prob'``
            descending. With more, the prob-ordered list is cut into three
            tiers (first half, up to four fifths, remainder), each tier is
            re-ordered by ``'page_rank_of_owner'`` descending, and the tiers
            are concatenated in that order. Every match is returned: the
            earlier slicing skipped the element at each tier boundary and
            placed the last tier first.
        """
        print("Calling recommender")

        # Filter based on languages
        candidates = set()
        for language in languages:
            candidates.update(self.language_proj[language])

        matches = []
        for project in candidates:
            if project not in self.project_vector:
                continue
            features = self.project_vector[project]
            if features['category'] not in area_interest:
                continue
            info = self.project_data[project]
            # Work on a copy so per-query fields are not written into the
            # shared project_vector entries.
            desc = dict(features)
            desc['html_url'] = info['html_url']
            desc['full_name'] = info['full_name']
            matches.append(desc)

        matches.sort(key=lambda desc: desc['prob'], reverse=True)

        # login -> PageRank lookup; setdefault keeps the first entry for a
        # login, matching what list.index() returned before. An empty
        # ranking simply yields an empty mapping.
        rank_by_user = {}
        for entry in self.user_ranking:
            rank_by_user.setdefault(entry[0], entry[1])

        for desc in matches:
            info = self.project_data[desc['full_name']]
            login = info['owner']['login']
            desc['page_rank_of_owner'] = rank_by_user.get(login, 0)
            desc['owner'] = login
            # Expose only the first contributor for the server side handling.
            contributors = info['contributors']
            if contributors:
                desc['contributors'] = contributors[0]['login']
                desc['contributors_url'] = contributors[0]['html_url']
            else:
                desc['contributors'] = ''
                desc['contributors_url'] = ''

        total = len(matches)
        if total <= 10:
            return matches

        # Floor division keeps the slice bounds integral on Python 2 and 3.
        half = total // 2
        four_fifths = total * 4 // 5
        tiers = (matches[:half], matches[half:four_fifths], matches[four_fifths:])

        reranked = []
        for tier in tiers:
            reranked.extend(
                sorted(tier, key=lambda desc: desc['page_rank_of_owner'], reverse=True))
        return reranked