def get_valid_data(self):
    return Conference(title="something",
                      url="anything",
                      deadline=datetime.now(),
                      metadata=Metadata(__name__, datetime.now(),
                                        "something.com/something",
                                        "something.com", "anythingProd"))
def test_insert_datetime(self):
    self.maxDiff = None
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        conf = Conference(title="Something",
                          url="www.something.com",
                          deadline=datetime.now(),
                          metadata=Metadata("something", datetime.now(),
                                            "www.something.com",
                                            "www.something.com", "something"),
                          dateRange=[
                              datetime.now(),
                              datetime.now() + timedelta(days=10)
                          ],
                          finalDue=datetime.now(),
                          location="something",
                          categories=["something"],
                          bulkText="something")
        is_inserted = self.mongo_db.put(conf)
        self.assertIsNotNone(is_inserted)
def parse_action(self):
    meta = Metadata(__name__,
                    datetime.datetime.now(),
                    website_url="somename.co.in/link",
                    domain_url="somename.co.in",
                    domain_name="somename",
                    **{"extra": "info you want to keep"})
    data = Conference(**{
        "title": "",
        "url": "",
        "deadline": datetime.datetime.now(),
        "metadata": meta
    })
    ## There are other optional fields for Conference as well;
    ## check out the docstring.
    ## Once done, you can call the db action.
    ## Use the methods already provided by the Scrapper class,
    ## such as getDate, getPage, etc.
    ## They are tested methods and are less likely to break your code.
    # self.getPage(" -- some page link --" , " -- some debug message --")
    #
    # PARSE DATA
    #
    # self.push_todb(data)
    self.logger.info(
        "Yay!! Data was put into the db, hopefully. Check the error logs, "
        "i.e. run with log level error.")
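For reference, a minimal sketch of what a filled-in parse_action might look like, assuming the Scrapper base class provides get_page and push_todb as used elsewhere in this repo. The URL, the CSS class, and the deadline handling below are placeholders, not the real page structure:

def parse_action(self):
    page = self.get_page(qlink="somename.co.in/conferences",  # placeholder URL
                         debug_msg="Fetching conference list",
                         allow_redirects=True)
    soup = bs(page.content, "html5lib")
    # "conf-row" is a hypothetical CSS class; use the real page structure.
    for row in soup.find_all("div", attrs={"class": "conf-row"}):
        meta = Metadata(__name__,
                        datetime.datetime.now(),
                        website_url="somename.co.in/link",
                        domain_url="somename.co.in",
                        domain_name="somename")
        data = Conference(title=row.h2.text,
                          url=row.a["href"],
                          deadline=datetime.datetime.now(),  # parse the real deadline here
                          metadata=meta)
        self.push_todb(data)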
def get_metadata(self): worker ="worker" date_extracted ="date_extracted" website_url = "website_url" domain_url = "domain_url" domain_name = "domain_name" meta_data_obj = Metadata(worker, date_extracted, website_url, domain_url, domain_name) return meta_data_obj
def _parse_top_conference(self, link: str) -> Conference:
    """
    Parses an individual top-conference page and returns a Conference object.

    Args:
    ---
        link: str

    Returns:
    ---
        Conference: object
    """
    page = self.get_page(qlink=link,
                         debug_msg=f'Parsing {link}',
                         allow_redirects=True)
    try:
        content = page.content
    except Exception as e:
        raise PageParsingError(
            f"The following error occurred while parsing {link} Trace: {e}")
    soup = bs(content, "html5lib")
    post_div = soup.find("div", attrs={"class": "single_post"})
    post_tables = post_div.find_all("table")
    title = soup.h1.text
    conf_info = self._get_top_conf_info(name=title, table=post_tables[0])
    rating_info = self._get_top_conf_ranking(name=title, table=post_tables[1])
    categories = list(rating_info.keys())[1:-1] if rating_info else []
    bulk_text = self._get_top_conf_bulk_text(soup)
    url = conf_info.get("link")
    deadline = conf_info.get("deadline")
    metadata = Metadata(
        __name__,
        dt.now(),
        link,
        self.base_address,
        self.scrapper_name,
    )
    additional_data = {}
    additional_data["dateRange"] = conf_info.get("dateRange")
    additional_data["location"] = conf_info.get("location")
    if not deadline:
        self.logger.debug(
            f"{title} not added because deadline info is not available")
        return None
    self.logger.debug(f"{title} is now added to the database")
    return Conference(title=title,
                      url=url,
                      deadline=deadline,
                      metadata=metadata,
                      bulkText=bulk_text,
                      categories=categories,
                      rankings=rating_info,
                      **additional_data)
def _parse_all_conference(self, link: str):
    """
    Parses an individual conference page and returns a Conference object.

    Args:
    ---
        link: str

    Returns:
    ---
        Conference: object
    """
    page = self.get_page(qlink=link,
                         debug_msg=f'Parsing {link}',
                         allow_redirects=True)
    try:
        content = page.content
    except Exception as e:
        self.logger.error(
            f"The following error occurred while trying to parse {link}: {e}")
        return None
    soup = bs(content, "html5lib")
    content_div = soup.find("div", attrs={"id": "content_box"})
    title = content_div.h1.text
    tables = soup.find_all('table')
    conf_info = self._get_all_conf_info(name=title, infotable=tables[0])
    bulk_text = self._get_all_conf_bulk(soup=soup)
    metadata = Metadata(
        __name__,
        dt.now(),
        link,
        self.base_address,
        self.scrapper_name,
    )
    additional_data = {}
    additional_data["bulkText"] = bulk_text
    additional_data["dateRange"] = conf_info.get("dateRange")
    additional_data["location"] = conf_info.get("location")
    deadline = conf_info.get("deadline")
    if not deadline:
        self.logger.debug(
            f"{title} not added because deadline info is not available")
        return None
    self.logger.debug(f"{title} is now added to the database")
    return Conference(title=title,
                      url=conf_info.get("link"),
                      deadline=deadline,
                      metadata=metadata,
                      **additional_data)
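For context, a minimal sketch of how this per-page parser might be driven over a list of links; conf_links is a hypothetical argument, and push_todb is the helper referenced in parse_action above:

def parse_all_conferences(self, conf_links):
    """Hypothetical driver: parse each page and store what validates."""
    for link in conf_links:
        conf = self._parse_all_conference(link)
        if conf is not None:  # None means no deadline was found
            self.push_todb(conf)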
def test_invalid_datetime(self):
    self.maxDiff = None
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        conf = Conference(title="Something",
                          url="www.something.com",
                          deadline=datetime.now() - timedelta(days=10),
                          metadata=Metadata(
                              "something", datetime.now(),
                              "www.something.com\\something.html",
                              "www.something.com", "something"),
                          dateRange=[
                              datetime.now(),
                              datetime.now() - timedelta(days=10)
                          ],
                          finalDue=datetime.now(),
                          location="something",
                          categories=["something"],
                          bulkText="something")
        is_inserted = self.mongo_db.put(conf)
        self.assertIsNone(is_inserted)
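Taken together, the two tests above pin down the contract this suite assumes for put(): a non-None return on a successful insert, and None when validation rejects the document (here, the past deadline and descending dateRange). A minimal sketch of calling code built on that assumption:

is_inserted = mongo_db.put(conf)  # mongo_db and conf as set up in the tests above
if is_inserted is None:
    print("conference rejected by validation, nothing inserted")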
def create_metadata(self, website_url, domain_url, domain_name, **kwargs):
    return Metadata(__name__, datetime.datetime.now(), website_url,
                    domain_url, domain_name, **kwargs)
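A quick usage sketch of this helper, assuming it is called from within the same class; the URL values are placeholders taken from the parse_action template above:

meta = self.create_metadata(website_url="somename.co.in/link",
                            domain_url="somename.co.in",
                            domain_name="somename",
                            extra="info you want to keep")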
def get_invalid_data(self):
    # Invalid: the second argument (the extraction date) is a plain
    # string rather than a datetime, unlike get_valid_data below.
    return Metadata(__name__, "anything", "something.com/something",
                    "something.com", "anythingProd")
def get_valid_data(self):
    return Metadata(__name__, datetime.now(), "something.com/something",
                    "something.com", "anythingProd")
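A minimal sketch of how these two fixtures might be exercised, assuming a unittest.TestCase subclass and that Metadata exposes its second positional argument as date_extracted (the name is inferred from get_metadata above):

def test_metadata_date_extracted_type(self):
    valid = self.get_valid_data()      # built with datetime.now()
    invalid = self.get_invalid_data()  # built with the string "anything"
    self.assertIsInstance(valid.date_extracted, datetime)
    self.assertNotIsInstance(invalid.date_extracted, datetime)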