def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)

    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)

    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        # Best-effort checkpoint of the generated candidates; inference continues even if it fails.
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except Exception:
        print("Couldn't save submit_puke")
    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)

    base = 0
    entire_articles = []
    # Pool of non-"heavy" items used to pad each reader's list up to 100 articles.
    not_heavy_items = set(range(1, article_count + 1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50
    random.seed(0)

    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            # Candidates and scores for this reader occupy a contiguous slice.
            articles = X_article_nums[base:base + group_size]
            scores = rank_scores[base:base + group_size]
            # Sort candidates by ranker score, best first, and keep the top `cut`.
            articles = [a for _, a in sorted(zip(scores, articles), key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]

            # Append articles from followed authors/magazines that are not already present.
            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)

            # Pad with random non-heavy items until there are 100 recommendations.
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)

            entire_articles.extend(articles)
            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)
            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))
            base += group_size

    print('Entropy of candidates = ', entropy(entire_articles))
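
# The `entropy()` helper used above is not defined in this snippet. Below is a
# minimal sketch, assuming it computes the Shannon entropy of the article-id
# distribution across all emitted recommendations (a rough diversity measure:
# higher means the candidates are spread more evenly across articles).
import math
from collections import Counter

def entropy(items):
    counts = Counter(items)
    total = float(sum(counts.values()))
    return -sum((c / total) * math.log2(c / total) for c in counts.values())
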
def __init__(self, seed_urls=None, save_html=1, use_splash=1,
             screenshot_dir='/memex-pinterest/ui/static/images/screenshots',
             op_time=10, **kwargs):
    '''
    Constructs a spider instance from the command line or the scrapyd daemon.

    :param seed_urls: comma-separated list of URLs; if empty, the crawler
        follows not-yet-crawled URLs from storage
    :param save_html: boolean 0/1
    :param use_splash: boolean 0/1
    :param screenshot_dir: used only when use_splash=1
    :param op_time: operating time in minutes; a negative value disables the constraint
    :param kwargs:
    :return:
    '''
    super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)

    self.screenshot_dir = screenshot_dir
    log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)

    if seed_urls:
        self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]

    self.ranker = Ranker.load()
    self.linkextractor = LinkExtractor()
    self.save_html = bool(save_html)
    self.use_splash = bool(use_splash)
    self.operating_time = int(op_time) * 60  # minutes -> seconds
    self.start_time = datetime.utcnow()
    self.finishing = False
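
# Usage sketch (an assumption, not from the original project): the keyword
# arguments above are normally supplied as Scrapy spider arguments, e.g.
#   scrapy crawl <spider_name> -a seed_urls=http://example.com -a op_time=30
# or via a scrapyd schedule request. The snippet below drives the spider
# programmatically instead; it assumes a Scrapy 1.x-style CrawlerProcess API,
# and the project-specific import path for TopicalFinder is omitted.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(
    TopicalFinder,
    seed_urls='http://example.com,http://example.org',
    save_html=1,
    use_splash=0,
    op_time=30,
)
process.start()
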
def train_and_score_mongo():
    """Retrain the ranker on user input, then rescore all items from mongo."""
    print("**************Training*********************")
    train_on_user_input()

    print("**************Scoring and Indexing*****************")
    mmu = MemexMongoUtils()
    docs = mmu.list_all_urls_iterator(return_html=True)
    ranker = Ranker.load()

    for doc in tqdm(docs, leave=True):
        try:
            score = ranker.score_doc(doc)
        except Exception:
            # Fall back to a zero score if the document cannot be scored.
            score = 0
        mmu.set_score(doc["url"], score)

    _score_hosts()
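
# Hypothetical entry point (an assumption, not part of the original snippet):
# run the retrain-and-rescore pass as a standalone script, e.g. from a cron job.
if __name__ == '__main__':
    train_and_score_mongo()
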