def _prepare_and_call(self, grouped_data):
    to_parse = {}
    default_searchterms = set(GOOGLE_SEARCHTERMS)
    for k, v in grouped_data.items():
        searchterms = set(v.get('searchterms') or [])
        if searchterms:
            # Keep only the default search terms not already present for this entry.
            to_parse[k] = {
                'update': v.get('update'),
                'searchterms': list(default_searchterms - searchterms),
            }
    logger.info('-' * 50)
    logger.info(to_parse)
    logger.info('-' * 50)
    self.write_json(to_parse)
    google_data = self.scrapyd_data.copy()
    google_data.update(
        spider=GOOGLE_NAME,
        # No companies are passed inline; the spider reads its input from the JSON file.
        companies=SPLITTER.join([]),
        json_data=self.json_path
    )
    run_scrapy_process(self.project_name, GOOGLE_NAME, google_data)
    return {}
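# A minimal sketch of the write_json helper the method above relies on, assuming
# it only serialises the payload to self.json_path so that the scheduled spider
# can pick it up via the json_data argument. The body below is an assumption,
# not the project's actual implementation.
import json

def write_json(self, data):
    # Persist the spider input next to the job so the crawl can read it back.
    with open(self.json_path, 'w') as fh:
        json.dump(data, fh)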
def execute_websites(self, force_update=False):
    companies_w_websites = self.get_wibsites_missing(force_update)
    self.logger.info(u"Google: getting websites for the following companies: %s"
                     % companies_w_websites)
    google_data = self.scrapyd_data.copy()
    google_data.update(
        spider=GOOGLE_NAME,
        companies=SPLITTER.join(companies_w_websites),
        only_website=True
    )
    run_scrapy_process(self.project_name, GOOGLE_NAME, google_data)
    del companies_w_websites
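# A hedged sketch of the run_scrapy_process helper used throughout this module,
# assuming it schedules the spider through scrapyd's schedule.json endpoint and
# forwards the remaining scrapyd_data entries as spider arguments. The
# scrapyd_url key is hypothetical; the real helper may instead shell out to
# `scrapy crawl` or use the scrapyd client library.
import requests

def run_scrapy_process(project_name, spider_name, scrapyd_data):
    payload = dict(scrapyd_data)
    payload.update(project=project_name, spider=spider_name)
    # Hypothetical key holding the scrapyd base URL, e.g. "http://localhost:6800".
    scrapyd_url = payload.pop('scrapyd_url', 'http://localhost:6800')
    response = requests.post('%s/schedule.json' % scrapyd_url, data=payload)
    response.raise_for_status()
    return response.json().get('jobid')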
def execute_update(self, force_update=False):
    companies_xing_update = self.get_companies_on_update(force_update)
    self.logger.info(u"Updating Xing for the following companies: %s"
                     % companies_xing_update)
    xing_data = self.scrapyd_data.copy()
    xing_data.update(spider=XING_NAME, json_data=self.json_path)
    self.write_json({'manual_data': companies_xing_update})
    run_scrapy_process(self.project_name, XING_NAME, xing_data)
    del companies_xing_update
def execute_update(self, force_update=False):
    companies_wiki_update = self.get_companies_on_update(force_update)
    self.logger.info(u"Updating Wikipedia for the following companies: %s"
                     % companies_wiki_update)
    wiki_data = self.scrapyd_data.copy()
    wiki_data.update(
        spider=WIKIPEDIA_NAME,
        json_data=self.json_path
    )
    self.write_json({'manual_data': companies_wiki_update})
    run_scrapy_process(self.project_name, WIKIPEDIA_NAME, wiki_data)
    del companies_wiki_update
def execute_search(self, force_update=False):
    companies_w_xing_url = self.get_missing_url(force_update)
    self.logger.info(u"Parsing Xing for the following companies: %s"
                     % companies_w_xing_url)
    xing_data = self.scrapyd_data.copy()
    xing_data.update(spider=XING_NAME, json_data=self.json_path, dont_filter=True)
    self.write_json({'companies': companies_w_xing_url})
    run_scrapy_process(self.project_name, XING_NAME, xing_data)
    del companies_w_xing_url
def execute_search(self, force_update=False):
    companies_w_wiki_url_revenue = self.get_missing_url_revenue(force_update)
    self.logger.info(u"Parsing Wikipedia for the following companies: %s"
                     % companies_w_wiki_url_revenue)
    wiki_data = self.scrapyd_data.copy()
    wiki_data.update(
        spider=WIKIPEDIA_NAME,
        json_data=self.json_path,
        dont_filter=True
    )
    # list() keeps the payload JSON-serializable under Python 3, where
    # dict.keys()/values() return view objects rather than lists.
    self.write_json({
        'companies': list(companies_w_wiki_url_revenue.keys()),
        'urls': list(companies_w_wiki_url_revenue.values())
    })
    run_scrapy_process(self.project_name, WIKIPEDIA_NAME, wiki_data)
    del companies_w_wiki_url_revenue
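# For illustration only: execute_search above assumes get_missing_url_revenue
# returns a mapping of company name -> known Wikipedia URL (or None when the URL
# still has to be discovered), which is why keys() and values() are written out
# as parallel lists. The sample data below is hypothetical.
companies_w_wiki_url_revenue = {
    u'ACME GmbH': u'https://de.wikipedia.org/wiki/ACME',
    u'Example AG': None,
}
payload = {
    'companies': list(companies_w_wiki_url_revenue.keys()),
    'urls': list(companies_w_wiki_url_revenue.values()),
}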