def __init__(self, *args, **kwargs):
    """Spider that scrapes a single listing URL.

    HACK: the process-based utils expect ``id`` to be a primary key, but we
    pass the listing URL in through it — if we passed a plain ``url`` kwarg,
    the process-based utils would not forward it on to the crawler process.

    Raises:
        KeyError: if ``id`` is missing from kwargs, or no ListingSource
            URL fuzzy-matches the given URL.
    """
    url = kwargs['id']
    # Extract the root domain. Still naive (no public-suffix handling);
    # consider https://github.com/john-kurkowski/tldextract for a real fix.
    netloc_split = urlparse(url).netloc.split(".")
    # Bug fix: the old check was `'www' in netloc_split`, which matched a
    # 'www' label *anywhere* in the host — e.g. 'api.www.example.com'
    # would pick index 1 ('www') as the domain. Only strip a leading
    # 'www', and guard against a single-label netloc.
    if netloc_split[0] == 'www' and len(netloc_split) > 1:
        domain = netloc_split[1]
    else:
        domain = netloc_split[0]
    # Narrow candidates in the DB by domain, then fuzzy-match to pick the
    # source whose configured URL is closest to the one we were given.
    listing_sources = ListingSource.objects.filter(url__icontains=domain).all()
    source_dict = {source.url: source for source in listing_sources}
    closest_url = fuzzy_search.get_closest_word(url, source_dict.keys())
    config = source_dict[closest_url].scraper_config
    self.scraper = config.scraper
    self.scrape_url = url
    self.ref_object = config
    super(IndividualListingSpider, self).__init__(*args, **kwargs)
def test_fuzzy_parser_gets_closest_source(target, sources, expected):
    """Fuzzy search should return the expected closest match for *target*."""
    assert fuzzy_search.get_closest_word(target, sources) == expected