def mass_evaluation(self):
    """Re-run the Google spider for every company flagged for manual entry.

    Collects all companies with ``manual_entry == "Yes"``, flips their flag
    to ``"manual"`` so they are not picked up again, schedules a forced
    Google crawl for them via scrapyd, waits for the spider to finish, and
    finally re-syncs all resources.
    """
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    query = session.query(Company.name).filter(
        Company.manual_entry == "Yes",
    )
    # BUGFIX: the names must be collected BEFORE the flag is flipped.
    # Previously the UPDATE + commit ran first, so re-executing the query
    # matched zero rows and ``companies`` was always empty.
    # Prefix each lower-cased name so the spider treats it as a forced update.
    companies = [u'update_{}'.format(row[0].lower()) for row in query]
    query.update({Company.manual_entry: "manual"}, synchronize_session="fetch")
    session.commit()
    logger.debug(companies)
    # NOTE(review): sibling methods join the company list with SPLITTER
    # before posting; here the raw list is sent -- confirm the spider
    # accepts this form.
    scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # Poll scrapyd until no jobs are pending or running.
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            # fixed "goggle" typo in the log message
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
def update_old(self):
    """Re-crawl wikipedia data for companies flagged as ``"old"``.

    Companies whose stored wiki URL is missing or a placeholder are routed
    to ``fixing_wrong_old_wiki``; the rest are scheduled on the manual
    wikipedia spider one by one. Once scrapyd reports no pending/running
    jobs, the wiki resources are synced and squirrel ratings refreshed.
    """
    get_old_wikipedia_companies()
    time.sleep(10)
    query = session.query(WikipediaDb.company_name_w, WikipediaDb.wiki_url_w).filter(
        WikipediaDb.manual_entry == "old",
    )
    print(query)
    existing_names = []
    existing_urls = []
    for row in query:
        existing_names.append(row[0])
        existing_urls.append(row[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        # Placeholder/missing URLs cannot be crawled -- repair them instead
        # of scheduling the spider.
        if url is None or url in (u'NA', u'N/A', u''):
            if url is None:
                logger.info(url)
                logger.info(name)
            fixing_wrong_old_wiki(name)
        else:
            scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=name, urls=url)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # BUGFIX: removed a stray duplicate ``requests.post`` that re-scheduled
    # the last payload after the loop (and posted a spider-less payload
    # when no companies were found).
    while True:
        resp = get_scrapyd_jobs(project_name)
        try:
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("wikipedia"))
                time.sleep(5)
                logger.info(resp)
            else:
                time.sleep(10)
                break
        except KeyError:
            # scrapyd returned an error payload instead of a job listing.
            if resp['status'] == u'error':
                time.sleep(5)
                logger.info(resp)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def mass_update(self, company_name, xing_login, xing_password, new_xing_url): xing_url = new_xing_url f = open("mx_crm/manual_queries/xing_url.txt", "w") f.write(xing_url) f.close() print('*' * 50) print('Start updating xing info for company {}'.format(company_name)) query = session.query(XingCompanyDb).filter( XingCompanyDb.company_name_x == company_name, ) query.update({XingCompanyDb.manual_entry: "ololo"}, synchronize_session="fetch") query.update({XingCompanyDb.xing_url: new_xing_url}, synchronize_session="fetch") session.commit() print('*' * 50) project_name = 'default' scrapyd_data = {'project': project_name} decode_company_name = u'{}'.format(company_name.decode('utf-8')) print decode_company_name company_name_lower = u'update_{}'.format( decode_company_name[0].lower()) update_company_name = company_name_lower + decode_company_name[1:] print(update_company_name) companies_names = [] force_update = True companies_names.append(decode_company_name.lower()) print('Start parsing given xing url {}'.format(xing_url)) companies = q.get_companies_for_xing(companies_names, force_update) companies = SPLITTER.join(companies) scrapyd_data.update(spider=XING_NAME, companies=companies, login=xing_login, password=xing_password) requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data) while True: from mx_crm.utils import get_scrapyd_jobs resp = get_scrapyd_jobs(project_name) if not len(resp['finished']): time.sleep(3) else: break requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data) while True: from mx_crm.utils import get_scrapyd_jobs resp = get_scrapyd_jobs(project_name) if not len(resp['finished']): time.sleep(3) else: break logger.info('Updating resources...') from mx_crm.synchronizers.resource_sync import ResourceSync RS = ResourceSync() RS.xing_sync()
def update_old(self):
    """Re-crawl xing data for companies flagged as ``"old"``.

    Companies whose stored xing URL is missing or a known placeholder are
    repaired via ``fixing_wrong_old``; the rest are scheduled on the manual
    xing spider. Afterwards the xing resources are synced and squirrel
    ratings refreshed.
    """
    get_old_xing_companies()
    time.sleep(10)
    xing_login = '******'
    xing_password = '******'
    query = session.query(XingCompanyDb.company_name_x, XingCompanyDb.xing_url).filter(
        XingCompanyDb.manual_entry == "old",
    )
    existing_names = []
    existing_urls = []
    for row in query:
        existing_names.append(row[0])
        existing_urls.append(row[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        # BUGFIX: broken/placeholder URLs were repaired but then STILL
        # posted to the spider (the checks were plain ``if``s with no
        # skip). Mirror the wikipedia flow: repair and move on.
        if url is None or url in ('NA', 'https://www.xing.com/companies'):
            fixing_wrong_old(name)
            continue
        scrapyd_data.update(spider=XING_MANUAL_NAME, companies=name, urls=url,
                            login=xing_login, password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # Poll scrapyd until no jobs are pending or running.
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("xing"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.xing_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def update_wikipedia_url(company_name, wikipedia_url): print('*' * 50) print( 'Start updating wikipedia url for company {}'.format(company_name)) print('New url is {}'.format(wikipedia_url)) query = session.query(WikipediaDb).filter( WikipediaDb.company_name_w == company_name, ) query.update({WikipediaDb.wiki_url_w: wikipedia_url}, synchronize_session="fetch") query.update({WikipediaDb.manual_entry: "Yes"}, synchronize_session="fetch") session.commit() print( 'New wikipedia url ({0}) for company {1} have successful updated'. format(wikipedia_url, company_name)) print('*' * 50) print('Start parsing page {}'.format(wikipedia_url)) print('*' * 50) companies_dict = {company_name: wikipedia_url} print companies_dict project_name = 'default' scrapyd_data = {'project': project_name} decode_company_name = u'{}'.format(company_name.decode('utf-8')) print decode_company_name company_name_lower = u'update_{}'.format( decode_company_name[0].lower()) update_company_name = company_name_lower + decode_company_name[1:] print(update_company_name) scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=update_company_name, urls=wikipedia_url) requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data) while True: from mx_crm.utils import get_scrapyd_jobs resp = get_scrapyd_jobs(project_name) print(resp) if len(resp['finished']) >= 1: break time.sleep(5) logger.info('Updating resources...') from mx_crm.synchronizers.resource_sync import ResourceSync RS = ResourceSync() RS.wiki_sync()
def update_wiki_company(self, company_name, wikipedia_url):
    """Force a manual wikipedia re-crawl of *company_name* at *wikipedia_url*.

    Writes the URL and company name to the manual-queries scratch files,
    marks the DB row as ``"manual"``, schedules the wikipedia spider via
    scrapyd, waits for completion, then syncs wiki resources.
    """
    company_name_for_file = u'{}'.format(company_name.decode('utf-8'))
    company_name = [company_name.lower()]
    wiki_url = wikipedia_url
    # ``with`` + io.open keeps both scratch files UTF-8 and guarantees the
    # handles are closed (previously the url file was a plain open() with a
    # manual .encode and no close-on-error).
    with io.open("mx_crm/manual_queries/wiki_url.txt", "w",
                 encoding="utf-8") as f:
        f.write(u'{}'.format(wiki_url))
    with io.open("mx_crm/manual_queries/wiki_company_name.txt", "w",
                 encoding="utf-8") as f:
        f.write(company_name_for_file)
    print('*' * 50)
    print('Start updating wikipedia info for company {}'.format(
        company_name[0]))
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name[0],
    )
    query.update({WikipediaDb.manual_entry: "manual"},
                 synchronize_session="fetch")
    session.commit()
    print('*' * 50)
    print('Start parsing given wiki url {}'.format(wiki_url))
    print('*' * 50)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    # force_update=True so an existing record does not suppress the crawl.
    companies_dict = q.get_companies_for_wikipedia(company_name, True)
    companies = SPLITTER.join(companies_dict.iterkeys())
    urls = SPLITTER.join(companies_dict.values())
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # Poll scrapyd until no jobs are pending or running.
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("wikipedia"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
def update_old(self):
    """Re-crawl google data for companies flagged as ``"old"``.

    Schedules a forced-update google crawl for each stale company, waits
    for the spider to finish, then re-syncs all resources and refreshes
    squirrel ratings.
    """
    # --current-date=2019-03-14 --current-time=20:00 --last-date=2019-03-08 --last-time=19:59 --spider="report"
    get_old_google_companies()
    time.sleep(10)
    query = session.query(Company.name, Company.website).filter(
        Company.manual_entry == "old",
    )
    existing_names = []
    existing_urls = []
    # BUGFIX: the loop previously iterated ``query[:1]`` (a debugging
    # leftover), so only the FIRST stale company was ever updated; the
    # wikipedia/xing counterparts process all rows.
    for row in query:
        existing_names.append(row[0])
        existing_urls.append(row[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        # "update_" prefix forces a re-crawl on the spider side.
        companies = u'update_{}'.format(name.lower())
        logger.info(companies)
        scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        try:
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("google"))
                time.sleep(5)
                logger.info(resp)
            else:
                time.sleep(10)
                break
        except KeyError:
            # scrapyd returned an error payload instead of a job listing.
            if resp['status'] == u'error':
                time.sleep(5)
                logger.info(resp)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
    RatingUpdate().update_squirrel_rating(existing_names)
def manual_update(self):
    """Run the manual wikipedia spider for all companies flagged ``"Yes"``.

    Schedules one scrapyd job per (company, wiki url) pair, waits for the
    spider to drain, then syncs wiki resources and refreshes squirrel
    ratings. (Removed: unused ``import os``, ``sys.argv`` debug logging,
    and commented-out scaffolding.)
    """
    query = session.query(WikipediaDb.company_name_w, WikipediaDb.wiki_url_w).filter(
        WikipediaDb.manual_entry == "Yes",
    )
    existing_names = []
    existing_urls = []
    for row in query:
        existing_names.append(row[0])
        existing_urls.append(row[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=name,
                            urls=url)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        try:
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("wikipedia"))
                time.sleep(5)
                logger.info(resp)
            else:
                time.sleep(10)
                break
        except KeyError:
            # scrapyd returned an error payload instead of a job listing.
            if resp['status'] == u'error':
                time.sleep(5)
                logger.info(resp)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def manual_update(self):
    """Run the manual xing spider for all companies flagged ``"Yes"``.

    Schedules one scrapyd job per (company, xing url) pair, polls until
    the spider has drained, then syncs xing resources and refreshes
    squirrel ratings.
    """
    xing_login = '******'
    xing_password = '******'
    query = session.query(XingCompanyDb.company_name_x, XingCompanyDb.xing_url).filter(
        XingCompanyDb.manual_entry == "Yes",
    )
    existing_names = []
    existing_urls = []
    for company, company_url in query:
        existing_names.append(company)
        existing_urls.append(company_url)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    pprint(existing_names)
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for company, company_url in dict_names_urls.iteritems():
        scrapyd_data.update(spider=XING_MANUAL_NAME,
                            companies=company,
                            urls=company_url,
                            login=xing_login,
                            password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # Poll scrapyd until nothing is pending or running.
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not (len(resp['pending']) or len(resp['running'])):
            time.sleep(10)
            break
        logger.debug('{} spider still working'.format("xing"))
        time.sleep(5)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.xing_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def manual_update(self):
    """Run the google spider for all companies flagged ``"Yes"``.

    Builds a single SPLITTER-joined list of "update_"-prefixed company
    names, schedules one scrapyd job for the whole batch, waits for the
    spider to drain, then syncs all resources and refreshes squirrel
    ratings. (Removed: commented-out per-company scheduling scaffolding.)
    """
    query = session.query(Company.name, Company.website).filter(
        Company.manual_entry == "Yes",
    )
    existing_names = []
    existing_urls = []
    for row in query:
        existing_names.append(row[0])
        existing_urls.append(row[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    # "update_" prefix forces a re-crawl on the spider side.
    # NOTE(review): unlike the mass/old google flows, names are NOT
    # lower-cased here -- confirm whether that is intentional.
    little_list_force_update = SPLITTER.join(
        u'update_{}'.format(company) for company in existing_names)
    logger.debug(little_list_force_update)
    scrapyd_data.update(spider=GOOGLE_NAME,
                        companies=little_list_force_update)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # Poll scrapyd until no jobs are pending or running.
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
    RatingUpdate().update_squirrel_rating(existing_names)
def sync_resources():
    """Run a full resource synchronization."""
    ResourceSync().sync_all()