def get_tag_by_link(link):
    url = helpers.get_url_from_link(link).strip('/').split('/')[0]
    if not Retriever.TAGS_MAPPING:
        # Lazily build the directory -> tag mapping from the main LibRu page.
        dm = WaitingDM()
        print 'downloading main page of LibRu for retrieving tags...'
        html = dm.download(helpers.get_site_root_link(link))
        soup = get_soup(html)
        dirs = Retriever.get_dirs(soup)
        for dir_link, tag in dirs:
            Retriever.TAGS_MAPPING[dir_link.strip('/')] = tag
    if url not in Retriever.TAGS_MAPPING:
        # TODO: find another way of retrieving the tag for this case
        logger.write_fail("LibRu parser: can't find tag in main page", link=link, url=url)
        return None
    return Retriever.TAGS_MAPPING[url]
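# Illustrative sketch only (not part of the original module): how the key
# into Retriever.TAGS_MAPPING is derived. The path below is made up;
# helpers.get_url_from_link is assumed to return a site-relative path such
# as 'somedir/author/book.txt'.
def _example_tag_key(relative_url):
    # The first path segment names the top-level LibRu directory, and that
    # segment is what get_tag_by_link looks up in Retriever.TAGS_MAPPING.
    return relative_url.strip('/').split('/')[0]

# _example_tag_key('somedir/author/book.txt') -> 'somedir'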
def refresh_libru():
    print 'scanning what should be refreshed...'
    refreshes = Refresh.objects.all()
    links = []
    ref_dm = WaitingDM()
    for refresh in refreshes:
        headers, page = ref_dm.download_headers(refresh.link)
        print refresh.link,
        field = LAST_MODIFIED
        if field in headers:
            new_date = headers[field]
            if refresh.check_refreshable(new_date):
                links.append((refresh.link, new_date))
                print '--> REFRESH',
        else:
            print ' no', field, 'in headers!',
        print
    if not links:
        print 'nothing to refresh. Everything is up-to-date'
    parser = LibRu()
    parser_name = parser.get_filename() + '_refresh'
    storage = TaskStorageDB(parser_name, [], Q(parser_name=parser_name, good=True))
    for link, date in links:
        refresh = Refresh.objects.get(link=link)
        refresh.delete()
        # TODO: don't forget to keep the Last-Modified dates up to date;
        # currently the date becomes actual only when the link is scanned again.
        # refresh.last_modified = date
        task = DirPage(link)
        print 'adding task for refresh:', task
        storage.accept_new_tasks([task])
    dm = DM_LibRu_AddRefreshLinks()
    tm = TM_LibRuRefresh(storage, dm)
    tm.run()
    print 'refreshing is finished'
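# Illustrative sketch only (an assumption, not the project's actual Refresh
# model): the kind of comparison check_refreshable() is expected to perform.
# refresh_libru() queues a link only when the server-reported Last-Modified
# header differs from the value recorded at the previous scan.
def _example_check_refreshable(stored_last_modified, server_last_modified):
    # Both values are raw header strings; a changed value means the page was
    # modified since the last scan and should be re-downloaded.
    return server_last_modified != stored_last_modified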
def read_data(self, page):
    # Book files only need enough bytes to detect the language;
    # everything else falls back to the default read.
    if self.bookfile_condition(page.url):
        return page.read(analyser.settings.LIBRU_DEFINE_LANG_BY_BYTES)
    return WaitingDM.read_data(self, page)