def train_model(item):
    """Fit a decision-tree regressor mapping paper feature vectors to click counts.

    Args:
        item: a SearchItem-like document with a `papers` list; per-paper click
            counts are looked up in ClickCount.

    Returns:
        A fitted `tree.DecisionTreeRegressor`, or None when the item has no
        papers, no recorded clicks, or fitting fails.
    """
    clicks = ClickCount.objects(search_item=item)
    if not item.papers or clicks.count() == 0:
        # Nothing to learn from.
        return None
    # Map paper id -> click count once; papers never clicked default to 0.
    counts = {str(cc.paper.id): cc.count for cc in clicks}
    x = [vectorize_paper(paper) for paper in item.papers]
    y = [counts.get(str(paper.id), 0) for paper in item.papers]
    try:
        regressor = tree.DecisionTreeRegressor()
        regressor.fit(x, y)
        return regressor
    except Exception:
        # The original used a bare `except:` that printed x/y and silently
        # returned None; log the failure (with the offending data) instead,
        # but keep the best-effort None return so callers are unaffected.
        logging.exception("DecisionTreeRegressor.fit failed; x=%s y=%s", x, y)
        return None
def get_scores(self):
    """Return this keyword's papers ordered by descending predicted click score.

    Loads the pickled regressor stored on the matching SearchItem, predicts a
    score for each paper's feature vector, and returns the papers best-first.

    Returns:
        list: papers sorted by predicted score (highest first), or None when
        no SearchItem matches, the item has no trained model, or anything
        goes wrong during scoring.
    """
    try:
        item = SearchItem.objects(keyword=self.keyword).get()
        if not item.model:
            # Explicit instead of the previous silent fall-through.
            return None
        # NOTE(review): pickle.loads on a stored blob is only safe if the
        # database contents are fully trusted — confirm before exposing.
        regressor = pickle.loads(item.model)
        papers = item.papers
        scores = regressor.predict([vectorize_paper(paper) for paper in papers])
        # Sort indices by score, best first.  Unlike the previous
        # itemgetter(*indices)(papers) this always returns a sequence,
        # even when there is exactly one paper (itemgetter with a single
        # index returns the bare element, not a tuple).
        order = sorted(range(len(papers)), key=lambda i: scores[i], reverse=True)
        return [papers[i] for i in order]
    except Exception as e:
        logging.debug(e)
        return None