def mass_update(self, company_name, xing_login, xing_password, new_xing_url):
    xing_url = new_xing_url
    f = open("mx_crm/manual_queries/xing_url.txt", "w")
    f.write(xing_url)
    f.close()
    print('*' * 50)
    print('Start updating xing info for company {}'.format(company_name))
    query = session.query(XingCompanyDb).filter(
        XingCompanyDb.company_name_x == company_name,
    )
    query.update({XingCompanyDb.manual_entry: "ololo"}, synchronize_session="fetch")
    query.update({XingCompanyDb.xing_url: new_xing_url}, synchronize_session="fetch")
    session.commit()
    print('*' * 50)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    decode_company_name = u'{}'.format(company_name.decode('utf-8'))
    print decode_company_name
    company_name_lower = u'update_{}'.format(decode_company_name[0].lower())
    update_company_name = company_name_lower + decode_company_name[1:]
    print(update_company_name)
    companies_names = []
    force_update = True
    companies_names.append(decode_company_name.lower())
    print('Start parsing given xing url {}'.format(xing_url))
    companies = q.get_companies_for_xing(companies_names, force_update)
    companies = SPLITTER.join(companies)
    scrapyd_data.update(spider=XING_NAME, companies=companies,
                        login=xing_login, password=xing_password)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    from mx_crm.utils import get_scrapyd_jobs
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.xing_sync()
def mass_evaluation(self):
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    force_update = True
    query = session.query(Company.name).filter(
        Company.manual_entry == "Yes",
    )
    # Collect the matching names before flipping manual_entry; once the rows
    # are updated to "manual" the same query no longer matches anything.
    companies = []
    for name in query:
        name = u'update_{}'.format(name[0].lower())
        companies.append(name)
    query.update({Company.manual_entry: "manual"}, synchronize_session="fetch")
    session.commit()
    # companies = q.get_companies_for_google_search(companies_names, force_update)
    companies = SPLITTER.join(companies)
    logger.debug(companies)
    scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
def tet():
    force_update = True
    company_name = ['50Hertz Transmission GmbH'.decode('utf-8')]
    company_name = map(lambda c: c.lower(), company_name)
    logger.debug('Found companies: {}'.format(company_name))
    updated_name = u'update_{}'.format(company_name[0])
    companies = []
    companies.append(updated_name)
    url = "https://de.wikipedia.org/wiki/50Hertz_Transmission"
    # urls = []
    # urls.append(url)
    print updated_name
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies_dict = q.get_companies_for_wikipedia(company_name, force_update)
    logger.debug(companies_dict)
    companies = companies_dict.iterkeys()
    companies = SPLITTER.join(companies)
    logger.debug(companies)
    urls = companies_dict.itervalues()
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    print scrapyd_data
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
def update_old(self):
    get_old_wikipedia_companies()
    time.sleep(10)
    query = session.query(WikipediaDb.company_name_w, WikipediaDb.wiki_url_w).filter(
        WikipediaDb.manual_entry == "old",
    )
    print(query)
    existing_names = []
    existing_urls = []
    for name in query:
        existing_names.append(name[0])
        existing_urls.append(name[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    import os
    s_file = sys.argv
    logger.info(s_file)
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        if url == u'NA':
            fixing_wrong_old_wiki(name)
        elif url == u'N/A':
            fixing_wrong_old_wiki(name)
        elif url == u'':
            fixing_wrong_old_wiki(name)
        elif url is None:
            logger.info(url)
            logger.info(name)
            fixing_wrong_old_wiki(name)
        else:
            # scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies='BKK Demag Krauss-Maffei', urls='www.bkk-dkm.de')
            scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=name, urls=url)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=dict_names_urls.keys(), urls=dict_names_urls.values())
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        try:
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("wikipedia"))
                time.sleep(5)
                logger.info(resp)
            else:
                time.sleep(10)
                break
        except KeyError:
            if resp['status'] == u'error':
                time.sleep(5)
                logger.info(resp)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def update_old(self):
    get_old_xing_companies()
    time.sleep(10)
    xing_login = '******'
    xing_password = '******'
    query = session.query(XingCompanyDb.company_name_x, XingCompanyDb.xing_url).filter(
        XingCompanyDb.manual_entry == "old",
    )
    existing_names = []
    existing_urls = []
    for name in query:
        existing_names.append(name[0])
        existing_urls.append(name[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    # pprint(existing_names)
    dict_names_urls = dict(zip(existing_names, existing_urls))
    # pprint('dict_names_urls')
    # pprint(dict_names_urls)
    for name, url in dict_names_urls.iteritems():
        # pprint(url)
        if url == 'NA':
            fixing_wrong_old(name)
        if url == 'https://www.xing.com/companies':
            fixing_wrong_old(name)
        if url is None:
            fixing_wrong_old(name)
        # scrapyd_data.update(spider=XING_MANUAL_NAME, companies='AVL Iberica S.A.', urls='www.avl.de/karriere',
        scrapyd_data.update(
            spider=XING_MANUAL_NAME,
            companies=name,
            urls=url,
            login=xing_login,
            password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("xing"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.xing_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def update_wikipedia_url(company_name, wikipedia_url):
    print('*' * 50)
    print('Start updating wikipedia url for company {}'.format(company_name))
    print('New url is {}'.format(wikipedia_url))
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name,
    )
    query.update({WikipediaDb.wiki_url_w: wikipedia_url}, synchronize_session="fetch")
    query.update({WikipediaDb.manual_entry: "Yes"}, synchronize_session="fetch")
    session.commit()
    print('New wikipedia url ({0}) for company {1} has been updated successfully'.format(
        wikipedia_url, company_name))
    print('*' * 50)
    print('Start parsing page {}'.format(wikipedia_url))
    print('*' * 50)
    companies_dict = {company_name: wikipedia_url}
    print companies_dict
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    decode_company_name = u'{}'.format(company_name.decode('utf-8'))
    print decode_company_name
    company_name_lower = u'update_{}'.format(decode_company_name[0].lower())
    update_company_name = company_name_lower + decode_company_name[1:]
    print(update_company_name)
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=update_company_name, urls=wikipedia_url)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    from mx_crm.utils import get_scrapyd_jobs
    while True:
        resp = get_scrapyd_jobs(project_name)
        print(resp)
        if len(resp['finished']) >= 1:
            break
        time.sleep(5)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
def google_update(self, google_names_urls):
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    for name, url in google_names_urls.iteritems():
        # The common google parser is used here, the same one as for regular parsing.
        scrapyd_data.update(
            spider=GOOGLE_NAME,
            companies=name
        )
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
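# Example (hypothetical) call to google_update(). The mapping shape is the point
# here: the helper expects company name -> website URL pairs, although only the
# names are forwarded to the google spider. The values below are illustrative
# only and not part of the original module.
# self.google_update({
#     u'50Hertz Transmission GmbH': u'https://www.50hertz.com',
#     u'Techem Energy Services GmbH': u'https://www.techem.de',
# })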
def update_wiki_company(self, company_name, wikipedia_url):
    company_name_for_file = u'{}'.format(company_name.decode('utf-8'))
    company_name = [company_name.lower()]
    wiki_url = wikipedia_url
    f = open("mx_crm/manual_queries/wiki_url.txt", "w")
    f.write(wiki_url.encode("utf-8"))
    f.close()
    f = io.open("mx_crm/manual_queries/wiki_company_name.txt", "w", encoding="utf-8")
    f.write(company_name_for_file)
    f.close()
    print('*' * 50)
    print('Start updating wikipedia info for company {}'.format(company_name[0]))
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name[0],
    )
    query.update({WikipediaDb.manual_entry: "manual"}, synchronize_session="fetch")
    session.commit()
    print('*' * 50)
    print('Start parsing given wiki url {}'.format(wiki_url))
    print('*' * 50)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies_dict = q.get_companies_for_wikipedia(company_name, True)
    companies = companies_dict.iterkeys()
    companies = SPLITTER.join(companies)
    urls = companies_dict.values()
    urls = SPLITTER.join(urls)
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("wikipedia"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
def import_companies_update(self):
    old_companies = q.get_imported_companies_older_than_one_year()
    pprint(old_companies[:2])
    little_list = old_companies[:2]
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    for name in little_list:
        # The common google parser is used here, the same one as for regular parsing.
        scrapyd_data.update(
            spider=GOOGLE_NAME,
            companies=name
        )
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
def wikipedia_manual(single_name, single_url, file, force_update):
    parse_data = {}
    companies_names = []
    if single_name and single_url:
        parse_data[single_name] = single_url
        companies_names.append(single_name)
    if file:
        try:
            wiki_wb = load_workbook(filename=file)
        except IOError:
            raise Exception('File "%s" does not exist!!!' % file)
        wiki_ws = wiki_wb.worksheets[0]
        rows = wiki_ws.rows
        rows.next()
        for row in rows:
            key = u'update_' + row[0].value if force_update else row[0].value
            parse_data[key] = row[1].value
            companies_names.append(row[0].value)
    project_name = 'default'
    json_path = json_data_path('manual_wiki_data.json')
    scrapyd_data = {'project': project_name}
    scrapyd_data.update(spider=WIKIPEDIA_NAME, json_data=json_path, is_manual_update_wiki=True)
    with open(json_path, 'w') as f:
        f.write(json.dumps({'manual_data': parse_data}))
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['finished']) >= 1:
            break
        time.sleep(5)
    logger.info('Updating resources...')
    sync_resources()
    if parse_data:
        logger.info('Creating all companies report...')
        all_companies_main(companies_names)
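# Example (hypothetical) call to wikipedia_manual(). Either a single name/url
# pair or an xlsx file with "company | url" rows can be supplied; when reading
# from the file, force_update prefixes the names with 'update_' so existing
# records are re-scraped. The values below are illustrative only and not part
# of the original module.
# wikipedia_manual(
#     single_name=u'50Hertz Transmission GmbH',
#     single_url=u'https://de.wikipedia.org/wiki/50Hertz_Transmission',
#     file=None,
#     force_update=True,
# )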
def update_old(self):
    # --current-date=2019-03-14 --current-time=20:00 --last-date=2019-03-08 --last-time=19:59 --spider="report"
    get_old_google_companies()
    time.sleep(10)
    query = session.query(Company.name, Company.website).filter(
        Company.manual_entry == "old",
    )
    existing_names = []
    existing_urls = []
    for name in query[:1]:
        existing_names.append(name[0])
        existing_urls.append(name[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    import os
    s_file = sys.argv
    logger.info(s_file)
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        companies = u'update_{}'.format(name.lower())
        logger.info(companies)
        scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        try:
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("google"))
                time.sleep(5)
                logger.info(resp)
            else:
                time.sleep(10)
                break
        except KeyError:
            if resp['status'] == u'error':
                time.sleep(5)
                logger.info(resp)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
    RatingUpdate().update_squirrel_rating(existing_names)
def aaa():
    company_name = ["Techem Energy Services GmbH"]
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies = q.get_companies_for_xing(company_name, True)
    companies = SPLITTER.join(companies)
    logger.debug(companies)
    # scrapyd_data.update(spider="xing", companies=companies, login="******", password="******")
    scrapyd_data.update(spider=XING_NAME, companies=companies,
                        login="******", password="******")
    logger.info(scrapyd_data)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
def manual_update(self):
    query = session.query(WikipediaDb.company_name_w, WikipediaDb.wiki_url_w).filter(
        WikipediaDb.manual_entry == "Yes",
    )
    existing_names = []
    existing_urls = []
    for name in query:
        existing_names.append(name[0])
        existing_urls.append(name[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    import os
    s_file = sys.argv
    logger.info(s_file)
    # dict_names_urls = dict(zip('sdfsdf', 'dsfsdf.com'))
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=name, urls=url)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        try:
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("wikipedia"))
                time.sleep(5)
                logger.info(resp)
            else:
                time.sleep(10)
                break
        except KeyError:
            if resp['status'] == u'error':
                time.sleep(5)
                logger.info(resp)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def manual_update(self):
    xing_login = '******'
    xing_password = '******'
    query = session.query(XingCompanyDb.company_name_x, XingCompanyDb.xing_url).filter(
        XingCompanyDb.manual_entry == "Yes",
    )
    existing_names = []
    existing_urls = []
    for name in query:
        existing_names.append(name[0])
        existing_urls.append(name[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    pprint(existing_names)
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        # scrapyd_data.update(spider=XING_MANUAL_NAME, companies='Ckw Centralschweizerische Kraftwerke', urls='https://www.xing.com/companies/ckw',
        scrapyd_data.update(spider=XING_MANUAL_NAME, companies=name, urls=url,
                            login=xing_login, password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("xing"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.xing_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def lll():
    company_name = ["element GmbH".lower()]
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies_dict = q.get_companies_for_wikipedia(company_name, True)
    companies = companies_dict.iterkeys()
    companies = SPLITTER.join(companies)
    urls = companies_dict.values()
    urls = SPLITTER.join(urls)
    print urls
    print companies
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    print scrapyd_data
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("wikipedia"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
def manual_update(self):
    query = session.query(Company.name, Company.website).filter(
        Company.manual_entry == "Yes",
    )
    existing_names = []
    existing_urls = []
    for name in query:
        existing_names.append(name[0])
        existing_urls.append(name[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    # dict_names_urls = dict(zip(['bueroservice 99 gmbh'], []))
    dict_names_urls = dict(zip(existing_names, existing_urls))
    little_list_force_update = []
    for company in existing_names:
        company = u'update_{}'.format(company)
        little_list_force_update.append(company)
    little_list_force_update = SPLITTER.join(little_list_force_update)
    logger.debug(little_list_force_update)
    scrapyd_data.update(spider=GOOGLE_NAME, companies=little_list_force_update)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # for name, url in dict_names_urls.iteritems():
    #     pprint(name)
    #     # the common google parser is used here, the same one as for regular parsing
    #     scrapyd_data.update(spider=GOOGLE_NAME, companies=name)
    #     requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
    RatingUpdate().update_squirrel_rating(existing_names)
def test(single_name, single_url, force_update):
    parse_data = {}
    companies_names = []
    if single_name and single_url:
        parse_data[single_name] = single_url
        companies_names.append(single_name)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    scrapyd_data.update(spider=WIKIPEDIA_NAME, is_manual_update_wiki=True)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['finished']) >= 1:
            break
        time.sleep(5)
    logger.info('Updating resources...')
    sync_resources()
def tet():
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies_names = []
    c_name = "50Hertz Transmission GmbH"
    companies_names.append(c_name)
    force_update = True
    companies_dict = q.get_companies_for_wikipedia(companies_names, force_update)
    logger.debug(companies_dict)
    companies = companies_dict.iterkeys()
    companies = SPLITTER.join(companies)
    logger.debug(companies)
    urls = companies_dict.itervalues()
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
def update_info():
    companies_names = ['50Hertz Transmission GmbH'.decode('utf-8')]
    companies_names = map(lambda c: c.lower(), companies_names)
    logger.debug('Found companies: {}'.format(companies_names))
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    force_update = True
    spider = 'wikipedia'
    xing_login = "******"
    xing_password = "******"
    if spider == "wikipedia":
        companies_dict = q.get_companies_for_wikipedia(companies_names, force_update)
        logger.debug(companies_dict)
        companies = companies_dict.iterkeys()
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        urls = list(companies_dict.itervalues())
        print companies_dict
        scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies)
        print "@@@@@@@@@@@@@@@@@@@"
        print scrapyd_data
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == "xing":
        companies = q.get_companies_for_xing(companies_names, force_update)
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        scrapyd_data.update(spider=XING_NAME, companies=companies,
                            login=xing_login, password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    else:
        spiders = OrderedDict()
        spiders[WIKIPEDIA_NAME] = []
        spiders[XING_NAME] = q.get_companies_for_xing(companies_names, force_update)
        for spider_name, companies in spiders.items():
            if spider_name == WIKIPEDIA_NAME:
                companies = q.get_companies_for_wikipedia(companies_names, force_update)
            logger.debug('{} spider has started'.format(spider_name))
            logger.debug(companies)
            post_data = scrapyd_data.copy()
            post_data.update(spider=spider_name)
            if spider_name == WIKIPEDIA_NAME:
                companies_dict = companies.copy()
                companies = SPLITTER.join(companies_dict.iterkeys())
                urls = SPLITTER.join(companies_dict.itervalues())
                post_data.update(companies=companies, urls=urls)
            else:
                companies = SPLITTER.join(companies)
                post_data.update(companies=companies, login=xing_login, password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, post_data)
            while True:
                resp = get_scrapyd_jobs(project_name)
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format(spider_name))
                    time.sleep(5)
                else:
                    time.sleep(10)
                    break
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['finished']) < len(spiders):
                time.sleep(3)
            else:
                break
    logger.info('Updating resources...')
    sync_resources()
def main(days, allow_import, force_update, import_file, db_update, spider,
         xing_login, xing_password, **kwargs):
    """
    Main function.

    :param days: days to extract requests from
    :param force_update: force update companies info in database from spiders
    :param allow_import: allows import
    :param import_file: path to xlsx file with the list of companies
    :param db_update: update info for all database companies
    :param spider: spider name
    :param xing_login: username/email for xing login
    :param xing_password: password for xing login
    """
    logger.info("Synchronize accesslogs with remote DB.")
    if os.name == 'nt':
        accesslog_sync()
    drupal_companies = None
    if allow_import:
        companies_names = XlsxImport(import_file, force_update=force_update).run()
        logger.info("Imported companies:")
        logger.info(companies_names)
    elif db_update:
        force_update = True
        companies_names = q.get_all_companies_names()
        # companies_names = ['eto gruppe beteiligungen kg'.decode('utf-8')]
    else:
        q.update_db_hosts()
        start_date, end_date = prepare_date_to_drupal_execute(days, **kwargs)
        google_analytics_companies = q.get_google_analytics_sessions(end_date, start_date, True)
        drupal_companies = q.get_drupal_sessions(end_date, start_date)
        companies_names = drupal_companies.keys()
        dates = {'end_date': end_date, 'start_date': start_date}
    # companies_names = ['eto gruppe beteiligungen kg'.decode('utf-8')]
    companies_names = map(lambda c: c.lower(), companies_names)
    logger.debug('Found companies: {}'.format(companies_names))
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    if spider == GOOGLE_NAME:
        companies = q.get_companies_for_google_search(companies_names, force_update)
        logger.info(companies)
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            logger.info(resp)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == GOOGLE_IMPORT:
        companies = q.get_imported_companies_older_than_one_year()
        little_list = companies[:75]
        little_list_force_update = []
        for company in little_list:
            company = u'update_{}'.format(company)
            little_list_force_update.append(company)
        logger.debug(little_list_force_update)
        little_list_force_update = SPLITTER.join(little_list_force_update)
        logger.debug(little_list_force_update)
        scrapyd_data.update(spider=GOOGLE_NAME, companies=little_list_force_update)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            logger.info(resp)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == WIKIPEDIA_NAME:
        companies_dict = q.get_companies_for_wikipedia(companies_names, force_update)
        logger.debug(companies_dict)
        companies = companies_dict.iterkeys()
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        urls = companies_dict.itervalues()
        scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == WIKIPEDIA_FIXING:
        companies_names = q.get_manual_wikipedia_companies()
        force_update_companies = []
        for company in companies_names:
            company = u'update_{}'.format(company)
            force_update_companies.append(company)
        logger.debug(force_update_companies)
        force_update_companies = SPLITTER.join(force_update_companies)
        logger.debug(force_update_companies)
        urls = q.get_websites_wikipedia(companies_names)
        q.set_wikipedia_manual_entry_manual(companies_names)
        scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=force_update_companies, urls=urls)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == XING_NAME:
        companies = q.get_companies_for_xing(companies_names, force_update)
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        scrapyd_data.update(spider=XING_NAME, companies=companies,
                            login=xing_login, password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    else:
        spiders = OrderedDict()
        spiders[GOOGLE_NAME] = q.get_companies_for_google_search(companies_names, force_update)
        spiders[WIKIPEDIA_NAME] = []
        # spiders[XING_NAME] = q.get_companies_for_xing([u'amazon.com inc.'], force_update)
        spiders[XING_NAME] = q.get_companies_for_xing(companies_names, force_update)
        # update_google_analytics_companies = force_update_google_analytics_companies(google_analytics_companies)
        google_name_list = list(spiders[GOOGLE_NAME])
        # google_name_list += update_google_analytics_companies
        # spiders[GOOGLE_NAME] = ([u'update_amazon.com inc.'])
        spiders[GOOGLE_NAME] = google_name_list
        # print(spiders[GOOGLE_NAME])
        for spider_name, companies in spiders.items():
            logger.info(spider_name)
            if spider_name == WIKIPEDIA_NAME:
                # companies = q.get_companies_for_wikipedia([u'amazon.com inc.'], force_update)
                companies = q.get_companies_for_wikipedia(companies_names, force_update)
            logger.debug('{} spider has started'.format(spider_name))
            logger.debug(companies)
            post_data = scrapyd_data.copy()
            post_data.update(spider=spider_name)
            if spider_name == WIKIPEDIA_NAME:
                companies_dict = companies.copy()
                companies = SPLITTER.join(companies_dict.iterkeys())
                urls = SPLITTER.join(companies_dict.itervalues())
                post_data.update(companies=companies, urls=urls)
            else:
                companies = SPLITTER.join(companies)
                post_data.update(companies=companies, login=xing_login, password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, post_data)
            while True:
                resp = get_scrapyd_jobs(project_name)
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format(spider_name))
                    time.sleep(5)
                else:
                    time.sleep(10)
                    break
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['finished']) < len(spiders):
                time.sleep(3)
            else:
                break
    logger.info('Updating resources...')
    sync_resources()
    if drupal_companies:
        logger.info('Creating drupal report...')
        me = MatchExecutor()
        me.create_report(drupal_companies, google_analytics_companies, dates=dates)
    elif allow_import and companies_names:
        logger.info('Creating all companies report...')
        all_companies_main(companies_names)
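# The schedule-then-poll pattern above repeats in nearly every updater: POST to
# SCRAPYD_SCHEDULE_URL, then loop on get_scrapyd_jobs() until the project has
# no pending or running jobs. A minimal sketch of a shared helper is given
# below; it is not part of the original module, and the function name and delay
# values are assumptions.
def _wait_for_scrapyd(project_name, spider_label='spider', poll_delay=5, settle_delay=10):
    """Block until scrapyd reports no pending or running jobs for the project."""
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp.get('pending', [])) or len(resp.get('running', [])):
            logger.debug('{} spider still working'.format(spider_label))
            time.sleep(poll_delay)
        else:
            # give scrapyd a moment to register the finished job before returning
            time.sleep(settle_delay)
            return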