def go_to_next_page(self, html_element, parent, predicted):
    """Follow pagination: yield a request for the next page of *parent*.

    Two classifier outcomes are handled:

    * ``next_page`` - *html_element* wraps the pagination link, so the
      first nested ``<a>`` tag is unwrapped to reach its ``href``;
    * ``next_page_link`` - *html_element* is the ``<a>`` link itself.

    :param html_element: bs4 element containing (or being) the link.
    :param parent: entity whose next page is being fetched; passed on
        in the request ``meta`` so the callback keeps the hierarchy.
    :param predicted: classifier label for *html_element*.
    """
    if predicted == self.rule_provider.get_mapping(m.next_page):
        try:
            first_a_html_element_inside_whole = html_element.findAll(
                "a")[0]
            link = first_a_html_element_inside_whole['href']
            self.logger_dbg.info("Going to next page: " + str(parent)
                                 + " unwrapped url: " + link)
            yield scrapy.Request(url=build_link(self.base_domain, link),
                                 callback=self.parse,
                                 meta={'parent': parent})
        # Was `except BaseException`, which also swallows
        # KeyboardInterrupt/SystemExit; `Exception` keeps the original
        # best-effort logging while letting process-control
        # exceptions propagate.
        except Exception as e:
            self.logger_dbg.error("Couldn't go to next page of: "
                                  + str(parent) + " due to: " + str(e))
            self.logger_dbg.error("Element that caused the problem: "
                                  + str(html_element))
    elif predicted == self.rule_provider.get_mapping(m.next_page_link):
        # The element itself is the <a> link - use its href directly.
        self.logger_dbg.info("Going to next page: " + str(parent)
                             + " url: " + html_element['href'])
        yield scrapy.Request(url=build_link(self.base_domain,
                                            html_element['href']),
                             callback=self.parse,
                             meta={'parent': parent})
def test_given_domain_and_link_when_building_link_proper_build_invision(self):
    """An already-absolute Invision link must come back unchanged."""
    domain = 'http://www.uk420.com/boards/'
    link = 'http://www.uk420.com/boards/index.php?/forum/103-outdoor-growing/'
    built = html_util.build_link(domain, link)
    self.assertEqual(built, link)
def test_given_url_from_without_domain_when_checking_true(self):
    """A relative vBulletin link produces a truthy, fully-built URL.

    The original lone ``assertTrue`` was vacuous - it passes for any
    non-empty string.  The exact expected value (``domain + link``, as
    the companion vBulletin test with identical inputs shows) is now
    also pinned so a regression in ``build_link`` is actually caught.
    """
    domain = 'https://www.forum.haszysz.com/'
    link = 'forumdisplay.php?97-Hodowla'
    res = html_util.build_link(domain, link)
    self.assertTrue(res)
    self.assertEqual(res, domain + link)
def test_given_domain_and_link_when_building_link_proper_build_vbulletin(self):
    """A relative vBulletin link gets the forum domain prefixed."""
    domain = 'https://www.forum.haszysz.com/'
    link = 'forumdisplay.php?97-Hodowla'
    built = html_util.build_link(domain, link)
    self.assertEqual(built, domain + link)
def test_given_domain_and_link_when_building_link_proper_build_phpbb(self):
    """A relative phpBB link (with ./ prefix) gets the domain prefixed."""
    domain = 'https://forum.vwgolf.pl/'
    link = './viewforum.php?f=157&sid=339596b98a9c27072f8ed07d68be22cd'
    built = html_util.build_link(domain, link)
    self.assertEqual(built, domain + link)
def parse_categories(self, html_element, predicted, parent):
    """Extract a category from *html_element*, persist it and follow it.

    Two classifier outcomes are handled:

    * ``category_title`` - the element is the category's ``<a>`` title
      link itself;
    * ``category_whole`` - the element wraps the link, so the first
      nested ``<a>`` is unwrapped first.

    When a category was saved and its link stays on the forum's own
    domain, a request for the category page is yielded with the new
    category as ``meta['parent']``.
    """
    category = None
    # Element is the title link itself.
    if predicted == self.rule_provider.get_mapping(m.category_title):
        link = html_element['href']
        title = str(html_element.contents[0])
        category = self.repository.save_category(title, link, parent,
                                                 self.forum)
        self.logger_dbg.info(title + " " + self.base_domain + link)
    # Element wraps the link - unwrap the first nested <a>.
    if predicted == self.rule_provider.get_mapping(m.category_whole):
        try:
            first_a_html_element_inside_whole = html_element.findAll(
                "a")[0]
            link = first_a_html_element_inside_whole['href']
            title = str(first_a_html_element_inside_whole.contents[0])
            category = self.repository.save_category(
                title, link, parent, self.forum)
            self.logger_dbg.info(title + " " + self.base_domain + link)
        # Was `except BaseException`: narrowed to Exception so
        # KeyboardInterrupt/SystemExit are no longer swallowed by this
        # best-effort parse.
        except Exception as e:
            self.logger_dbg.error(str(e))
            self.logger_dbg.error("Can't find category inside: "
                                  + str(html_element))
    if category is not None and html_util.url_not_from_other_domain(
            category.link, self.base_domain):
        yield scrapy.Request(url=build_link(self.base_domain,
                                            category.link),
                             callback=self.parse,
                             meta={'parent': category})
def prepare_strategy(self, spider):
    """Build start requests for every category listed in the config file."""
    # Categories to scrape are declared in a semicolon-separated CSV
    # with a `category_id` column.
    config = pd.read_csv("config/categories.csv", sep=';')
    wanted_ids = set(config['category_id'])
    categories = self.repository.get_categories(wanted_ids)
    base = self.forum.link
    self.strategy_initialized = True
    for cat in categories:
        yield scrapy.Request(url=build_link(base, cat.link),
                             callback=spider.parse,
                             meta={'parent': cat})
def parse_topics(self, html_element, parent):
    """Extract a topic (title, link, author, date) from *html_element*,
    persist it and yield a request for the topic page.

    Yields nothing when no title/link could be located or when the
    topic fails the filtering criterion.
    """
    author = None
    date = None
    link = None
    title = None
    for tag in self.rule_provider.possible_tags_topics:
        for node in html_element.findAll(tag):
            if not html_util.element_has_css_class(node):
                continue
            label = self.rule_provider.predict(tag, node["class"])
            if label == self.rule_provider.get_mapping(m.topic_title):
                title = node.contents[0]
                link = node['href']
                self.logger_dbg.info(title + " " + link)
            if label == self.rule_provider.get_mapping(m.author):
                author = node.contents[0]
            if label == self.rule_provider.get_mapping(m.topic_date):
                date = dpt.parse_date(node.contents)
    # Additional check for English-speaking Invision boards: a <time>
    # tag marks the date; when present it (and the first anchor's
    # href/title) take precedence over whatever the loop found.
    time_tags = html_element.findAll("time")
    if time_tags:
        date = dpt.parse_english_date(time_tags[0].contents)
        first_anchor = html_element.findAll('a')[0]
        link = first_anchor['href']
        title = first_anchor['title']
    if title is None or link is None:
        self.logger_dbg.info("Can't find topic inside: "
                             + str(html_element))
        return
    if not filtering.topic_meets_criterion(title, author, date):
        return
    topic = self.repository.save_topic(author, date, link, parent, title)
    self.logger_dbg.info("Scrapped topic: " + str(topic))
    yield scrapy.Request(dont_filter=True,
                         url=build_link(self.base_domain, topic.link),
                         callback=self.parse,
                         meta={'parent': topic})