Example #1
    def mass_evaluation(self):
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        force_update = True
        query = session.query(Company.name).filter(
            Company.manual_entry == "Yes", )
        # Fetch the matching names first: once the flag is flipped to "manual"
        # and committed, re-executing this query would return no rows.
        rows = query.all()
        query.update({Company.manual_entry: "manual"},
                     synchronize_session="fetch")
        session.commit()
        companies = []
        for row in rows:
            companies.append(u'update_{}'.format(row[0].lower()))
        #companies = q.get_companies_for_google_search(companies_names, force_update)
        #companies = SPLITTER.join(companies)
        logger.debug(companies)
        scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("google"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.sync_all()
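
A note on the repeated polling loop: each example below waits for Scrapyd with its own hand-rolled "while True" loop. A minimal sketch of that wait as a single helper, assuming get_scrapyd_jobs (imported from mx_crm.utils in the later examples) returns Scrapyd's listjobs.json payload with 'pending', 'running' and 'finished' lists; the helper name wait_for_scrapyd_jobs and its parameters are illustrative, not part of mx_crm:

import time
import logging

from mx_crm.utils import get_scrapyd_jobs

logger = logging.getLogger(__name__)


def wait_for_scrapyd_jobs(project_name, poll_interval=5, settle_delay=10):
    """Block until Scrapyd reports no pending or running jobs for the project."""
    while True:
        resp = get_scrapyd_jobs(project_name)
        if resp.get('pending') or resp.get('running'):
            logger.debug('spider still working')
            time.sleep(poll_interval)
        else:
            # short extra pause so items from the last job are written before syncing
            time.sleep(settle_delay)
            return

With such a helper, each polling block below reduces to a single wait_for_scrapyd_jobs(project_name) call; the error-payload handling in Examples #2 and #7 (a response carrying 'status': 'error' instead of job lists) could be folded in the same way.
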
Example #2
    def update_old(self):
        get_old_wikipedia_companies()
        time.sleep(10)
        query = session.query(WikipediaDb.company_name_w,
                              WikipediaDb.wiki_url_w).filter(
                                  WikipediaDb.manual_entry == "old", )
        print(query)
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        import os
        s_file = sys.argv
        logger.info(s_file)
        dict_names_urls = dict(zip(existing_names, existing_urls))
        for name, url in dict_names_urls.iteritems():
            # re-queue companies whose stored wiki url is missing or a placeholder
            if url in (u'NA', u'N/A', u'', None):
                if url is None:
                    logger.info(url)
                    logger.info(name)
                fixing_wrong_old_wiki(name)
            else:
                # scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies='BKK Demag Krauss-Maffei', urls='www.bkk-dkm.de')
                scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME,
                                    companies=name,
                                    urls=url)
                requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        #scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=dict_names_urls.keys(), urls=dict_names_urls.values())
        while True:
            resp = get_scrapyd_jobs(project_name)
            try:
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format("wikipedia"))
                    time.sleep(5)
                    logger.info(resp)
                else:
                    time.sleep(10)
                    break
            except KeyError:
                if resp['status'] == u'error':
                    time.sleep(5)
                    logger.info(resp)
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #3
    def mass_update(self, company_name, xing_login, xing_password,
                    new_xing_url):
        xing_url = new_xing_url
        f = open("mx_crm/manual_queries/xing_url.txt", "w")
        f.write(xing_url)
        f.close()
        print('*' * 50)
        print('Start updating xing info for company {}'.format(company_name))
        query = session.query(XingCompanyDb).filter(
            XingCompanyDb.company_name_x == company_name, )
        query.update({XingCompanyDb.manual_entry: "ololo"},
                     synchronize_session="fetch")
        query.update({XingCompanyDb.xing_url: new_xing_url},
                     synchronize_session="fetch")
        session.commit()
        print('*' * 50)

        project_name = 'default'
        scrapyd_data = {'project': project_name}
        decode_company_name = u'{}'.format(company_name.decode('utf-8'))
        print(decode_company_name)
        company_name_lower = u'update_{}'.format(
            decode_company_name[0].lower())
        update_company_name = company_name_lower + decode_company_name[1:]
        print(update_company_name)

        companies_names = []
        force_update = True
        companies_names.append(decode_company_name.lower())

        print('Start parsing given xing url {}'.format(xing_url))
        companies = q.get_companies_for_xing(companies_names, force_update)
        companies = SPLITTER.join(companies)
        scrapyd_data.update(spider=XING_NAME,
                            companies=companies,
                            login=xing_login,
                            password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            from mx_crm.utils import get_scrapyd_jobs
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            from mx_crm.utils import get_scrapyd_jobs
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.xing_sync()
Example #4
    def update_old(self):
        get_old_xing_companies()
        time.sleep(10)
        xing_login = '******'
        xing_password = '******'
        query = session.query(XingCompanyDb.company_name_x,
                              XingCompanyDb.xing_url).filter(
                                  XingCompanyDb.manual_entry == "old", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}

        #pprint(existing_names)

        dict_names_urls = dict(zip(existing_names, existing_urls))
        #pprint('dict_names_urls')
        #pprint(dict_names_urls)

        for name, url in dict_names_urls.iteritems():
            #pprint(url)
            if url in ('NA', 'https://www.xing.com/companies') or url is None:
                fixing_wrong_old(name)
            #scrapyd_data.update(spider=XING_MANUAL_NAME, companies='AVL Iberica S.A.', urls='www.avl.de/karriere',
            scrapyd_data.update(spider=XING_MANUAL_NAME,
                                companies=name,
                                urls=url,
                                login=xing_login,
                                password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)

        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("xing"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.xing_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #5
    def update_wikipedia_url(company_name, wikipedia_url):
        print('*' * 50)
        print(
            'Start updating wikipedia url for company {}'.format(company_name))
        print('New url is {}'.format(wikipedia_url))
        query = session.query(WikipediaDb).filter(
            WikipediaDb.company_name_w == company_name, )
        query.update({WikipediaDb.wiki_url_w: wikipedia_url},
                     synchronize_session="fetch")
        query.update({WikipediaDb.manual_entry: "Yes"},
                     synchronize_session="fetch")
        session.commit()
        print('New wikipedia url ({0}) for company {1} has been successfully updated'.format(
            wikipedia_url, company_name))
        print('*' * 50)
        print('Start parsing page {}'.format(wikipedia_url))
        print('*' * 50)

        companies_dict = {company_name: wikipedia_url}

        print(companies_dict)

        project_name = 'default'
        scrapyd_data = {'project': project_name}
        decode_company_name = u'{}'.format(company_name.decode('utf-8'))
        print(decode_company_name)
        company_name_lower = u'update_{}'.format(
            decode_company_name[0].lower())
        update_company_name = company_name_lower + decode_company_name[1:]
        print(update_company_name)
        scrapyd_data.update(spider=WIKIPEDIA_NAME,
                            companies=update_company_name,
                            urls=wikipedia_url)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)

        while True:
            from mx_crm.utils import get_scrapyd_jobs
            resp = get_scrapyd_jobs(project_name)
            print(resp)
            if len(resp['finished']) >= 1:
                break
            time.sleep(5)

        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()
Example #6
    def update_wiki_company(self, company_name, wikipedia_url):
        company_name_for_file = u'{}'.format(company_name.decode('utf-8'))
        company_name = [company_name.lower()]
        wiki_url = wikipedia_url
        f = open("mx_crm/manual_queries/wiki_url.txt", "w")
        f.write(wiki_url.encode("utf-8"))
        f.close()
        f = io.open("mx_crm/manual_queries/wiki_company_name.txt",
                    "w",
                    encoding="utf-8")
        f.write(company_name_for_file)
        f.close()

        print('*' * 50)
        print('Start updating wikipedia info for company {}'.format(
            company_name[0]))
        query = session.query(WikipediaDb).filter(
            WikipediaDb.company_name_w == company_name[0], )
        query.update({WikipediaDb.manual_entry: "manual"},
                     synchronize_session="fetch")
        session.commit()
        print('*' * 50)
        print('Start parsing given wiki url {}'.format(wiki_url))
        print('*' * 50)
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        companies_dict = q.get_companies_for_wikipedia(company_name, True)
        companies = companies_dict.iterkeys()
        companies = SPLITTER.join(companies)
        urls = companies_dict.values()
        urls = SPLITTER.join(urls)
        scrapyd_data.update(spider=WIKIPEDIA_NAME,
                            companies=companies,
                            urls=urls)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("wikipedia"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()
Example #7
    def update_old(self):
        # --current-date=2019-03-14 --current-time=20:00 --last-date=2019-03-08 --last-time=19:59 --spider="report"
        get_old_google_companies()
        time.sleep(10)
        query = session.query(Company.name, Company.website).filter(
            Company.manual_entry == "old", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        import os
        s_file = sys.argv
        logger.info(s_file)
        dict_names_urls = dict(zip(existing_names, existing_urls))
        for name, url in dict_names_urls.iteritems():
            companies = u'update_{}'.format(name.lower())
            logger.info(companies)
            scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            try:
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format("google"))
                    time.sleep(5)
                    logger.info(resp)
                else:
                    time.sleep(10)
                    break
            except KeyError:
                if resp['status'] == u'error':
                    time.sleep(5)
                    logger.info(resp)
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.sync_all()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #8
    def manual_update(self):
        query = session.query(WikipediaDb.company_name_w,
                              WikipediaDb.wiki_url_w).filter(
                                  WikipediaDb.manual_entry == "Yes", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        import os
        s_file = sys.argv
        logger.info(s_file)
        # dict_names_urls = dict(zip('sdfsdf', 'dsfsdf.com'))
        dict_names_urls = dict(zip(existing_names, existing_urls))
        for name, url in dict_names_urls.iteritems():
            scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME,
                                companies=name,
                                urls=url)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            try:
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format("wikipedia"))
                    time.sleep(5)
                    logger.info(resp)
                else:
                    time.sleep(10)
                    break
            except KeyError:
                if resp['status'] == u'error':
                    time.sleep(5)
                    logger.info(resp)
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #9
    def manual_update(self):
        xing_login = '******'
        xing_password = '******'
        query = session.query(XingCompanyDb.company_name_x,
                              XingCompanyDb.xing_url).filter(
                                  XingCompanyDb.manual_entry == "Yes", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}

        pprint(existing_names)

        dict_names_urls = dict(zip(existing_names, existing_urls))

        for name, url in dict_names_urls.iteritems():
            #scrapyd_data.update(spider=XING_MANUAL_NAME, companies='Ckw Centralschweizerische Kraftwerke', urls='https://www.xing.com/companies/ckw',
            scrapyd_data.update(spider=XING_MANUAL_NAME,
                                companies=name,
                                urls=url,
                                login=xing_login,
                                password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("xing"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.xing_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #10
    def manual_update(self):
        query = session.query(Company.name, Company.website).filter(
            Company.manual_entry == "Yes", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        # dict_names_urls = dict(zip(['bueroservice 99 gmbh'], []))
        dict_names_urls = dict(zip(existing_names, existing_urls))
        little_list_force_update = []
        for company in existing_names:
            company = u'update_{}'.format(company)
            little_list_force_update.append(company)
        little_list_force_update = SPLITTER.join(little_list_force_update)
        logger.debug(little_list_force_update)
        scrapyd_data.update(spider=GOOGLE_NAME,
                            companies=little_list_force_update)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        #for name, url in dict_names_urls.iteritems():
        #    pprint(name)
        #    scrapyd_data.update(spider=GOOGLE_NAME, companies=name)   # the common google spider is used here, the same as in regular parsing
        #    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("google"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.sync_all()

        RatingUpdate().update_squirrel_rating(existing_names)
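
Examples #1, #3 and #10 all build the force-refresh payload the same way: each company name gets an 'update_' prefix and the list is joined with SPLITTER before being posted to SCRAPYD_SCHEDULE_URL. A minimal sketch of that step as a standalone function; SPLITTER, GOOGLE_NAME and SCRAPYD_SCHEDULE_URL are the same module-level constants the examples rely on (their import path is not shown above), and the name schedule_force_update is hypothetical:

import requests


def schedule_force_update(project_name, company_names):
    """Schedule the google spider for the given companies, forcing a re-crawl."""
    # the 'update_' prefix appears to be how these examples mark a forced refresh
    companies = SPLITTER.join(u'update_{}'.format(name.lower())
                              for name in company_names)
    scrapyd_data = {'project': project_name,
                    'spider': GOOGLE_NAME,
                    'companies': companies}
    return requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)

Example #10 above keeps the original casing when adding the prefix, while Example #1 lowercases the name first; the sketch follows Example #1.
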
Example #11
def sync_resources():
    RS = ResourceSync()
    RS.sync_all()