def __init__(self, *args, **kwargs):
    """Resolve the ListingSource whose URL best matches the requested page.

    HACK: the process-based utils expect ``id`` to be a primary key, but we
    pass the target URL through it instead — if we passed a ``url`` kwarg,
    the process-based utils would not forward it on to the crawler process.

    Raises:
        KeyError: if ``id`` is missing from kwargs, or no configured
            ListingSource URL fuzzy-matches the requested URL.
    """
    url = kwargs['id']

    # Extract a root-domain-ish token from the host. Still naive (no public
    # suffix list) — consider https://github.com/john-kurkowski/tldextract.
    # Dropping every 'www' label (and empty labels) instead of indexing past
    # the first one fixes hosts where 'www' is present but not the leading
    # label, which previously selected the literal string 'www' as domain.
    labels = [p for p in urlparse(url).netloc.split(".") if p and p != 'www']
    domain = labels[0] if labels else ''

    # Narrow to sources whose URL contains the domain token, then let the
    # fuzzy matcher pick the single closest configured source URL.
    listing_sources = ListingSource.objects.filter(url__icontains=domain).all()
    source_dict = {k.url: k for k in listing_sources}

    closest_url = fuzzy_search.get_closest_word(url, source_dict.keys())

    config = source_dict[closest_url].scraper_config

    self.scraper = config.scraper
    self.scrape_url = url
    self.ref_object = config

    super(IndividualListingSpider, self).__init__(*args, **kwargs)
def test_fuzzy_parser_gets_closest_source(target, sources, expected):
    """The fuzzy parser should pick, from *sources*, the word closest to *target*."""
    assert fuzzy_search.get_closest_word(target, sources) == expected