Example #1
    def mass_update(self, company_name, xing_login, xing_password,
                    new_xing_url):
        xing_url = new_xing_url
        f = open("mx_crm/manual_queries/xing_url.txt", "w")
        f.write(xing_url)
        f.close()
        print('*' * 50)
        print('Start updating xing info for company {}'.format(company_name))
        query = session.query(XingCompanyDb).filter(
            XingCompanyDb.company_name_x == company_name, )
        query.update({XingCompanyDb.manual_entry: "ololo"},
                     synchronize_session="fetch")
        query.update({XingCompanyDb.xing_url: new_xing_url},
                     synchronize_session="fetch")
        session.commit()
        print('*' * 50)

        project_name = 'default'
        scrapyd_data = {'project': project_name}
        decode_company_name = u'{}'.format(company_name.decode('utf-8'))
        print(decode_company_name)
        company_name_lower = u'update_{}'.format(
            decode_company_name[0].lower())
        update_company_name = company_name_lower + decode_company_name[1:]
        print(update_company_name)

        companies_names = []
        force_update = True
        companies_names.append(decode_company_name.lower())

        print('Start parsing given xing url {}'.format(xing_url))
        companies = q.get_companies_for_xing(companies_names, force_update)
        companies = SPLITTER.join(companies)
        scrapyd_data.update(spider=XING_NAME,
                            companies=companies,
                            login=xing_login,
                            password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        from mx_crm.utils import get_scrapyd_jobs
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.xing_sync()
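Note: every example on this page schedules a spider through SCRAPYD_SCHEDULE_URL and then busy-waits on get_scrapyd_jobs() with slightly different exit conditions. The sketch below is not part of the original module; it only illustrates how that repeated polling loop could be factored out, assuming the same get_scrapyd_jobs(project) helper the examples import, which returns a dict with 'pending', 'running' and 'finished' job lists.

import time

from mx_crm.utils import get_scrapyd_jobs  # same helper used in the examples


def wait_for_scrapyd(project_name, poll_interval=5, settle_delay=10):
    # Hypothetical helper: block until scrapyd reports no pending or running
    # jobs for the project, then return the last jobs listing.
    while True:
        resp = get_scrapyd_jobs(project_name)
        if resp.get('pending') or resp.get('running'):
            time.sleep(poll_interval)  # a spider is still working
        else:
            time.sleep(settle_delay)   # give the last job time to flush results
            return resp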
Example #2
 def mass_evaluation(self):
     project_name = 'default'
     scrapyd_data = {'project': project_name}
     force_update = True
     query = session.query(Company.name).filter(
         Company.manual_entry == "Yes", )
     # Collect the names before the flag is flipped, otherwise re-running the
     # query below would no longer match any rows.
     companies = []
     for name in query:
         name = u'update_{}'.format(name[0].lower())
         companies.append(name)
     query.update({Company.manual_entry: "manual"},
                  synchronize_session="fetch")
     session.commit()
     #companies = q.get_companies_for_google_search(companies_names, force_update)
     #companies = SPLITTER.join(companies)
     logger.debug(companies)
     scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
     requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
     while True:
         resp = get_scrapyd_jobs(project_name)
         if len(resp['pending']) or len(resp['running']):
             logger.debug('{} spider still working'.format("goggle"))
             time.sleep(5)
         else:
             time.sleep(10)
             break
     logger.info('Updating resources...')
     from mx_crm.synchronizers.resource_sync import ResourceSync
     RS = ResourceSync()
     RS.sync_all()
Example #3
def tet():
    force_update = True
    company_name = ['50Hertz Transmission GmbH'.decode('utf-8')]
    company_name = map(lambda c: c.lower(), company_name)
    logger.debug('Found companies: {}'.format(company_name))
    updated_name = u'update_{}'.format(company_name[0])
    companies = []
    companies.append(updated_name)
    url = "https://de.wikipedia.org/wiki/50Hertz_Transmission"
    #   urls = []
    #   urls.append(url)
    print(updated_name)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies_dict = q.get_companies_for_wikipedia(company_name, force_update)
    logger.debug(companies_dict)
    companies = companies_dict.iterkeys()
    companies = SPLITTER.join(companies)
    logger.debug(companies)
    urls = SPLITTER.join(companies_dict.itervalues())
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    print(scrapyd_data)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
Example #4
    def update_old(self):
        get_old_wikipedia_companies()
        time.sleep(10)
        query = session.query(WikipediaDb.company_name_w,
                              WikipediaDb.wiki_url_w).filter(
                                  WikipediaDb.manual_entry == "old", )
        print(query)
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        import os
        s_file = sys.argv
        logger.info(s_file)
        dict_names_urls = dict(zip(existing_names, existing_urls))
        for name, url in dict_names_urls.iteritems():
            if url in (u'NA', u'N/A', u''):
                fixing_wrong_old_wiki(name)
            elif url is None:
                logger.info(url)
                logger.info(name)
                fixing_wrong_old_wiki(name)
            else:
                # scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies='BKK Demag Krauss-Maffei', urls='www.bkk-dkm.de')
                scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME,
                                    companies=name,
                                    urls=url)
                requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        #scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=dict_names_urls.keys(), urls=dict_names_urls.values())
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            try:
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format("wikipedia"))
                    time.sleep(5)
                    logger.info(resp)
                else:
                    time.sleep(10)
                    break
            except KeyError:
                if resp['status'] == u'error':
                    time.sleep(5)
                    logger.info(resp)
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #5
    def update_old(self):
        get_old_xing_companies()
        time.sleep(10)
        xing_login = '******'
        xing_password = '******'
        query = session.query(XingCompanyDb.company_name_x,
                              XingCompanyDb.xing_url).filter(
                                  XingCompanyDb.manual_entry == "old", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}

        #pprint(existing_names)

        dict_names_urls = dict(zip(existing_names, existing_urls))
        #pprint('dict_names_urls')
        #pprint(dict_names_urls)

        for name, url in dict_names_urls.iteritems():
            #pprint(url)
            if url in ('NA', 'https://www.xing.com/companies') or url is None:
                fixing_wrong_old(name)
            # scrapyd_data.update(spider=XING_MANUAL_NAME, companies='AVL Iberica S.A.', urls='www.avl.de/karriere', ...)
            scrapyd_data.update(
                spider=XING_MANUAL_NAME,
                companies=name,
                urls=url,
                login=xing_login,
                password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)

        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("xing"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.xing_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #6
    def update_wikipedia_url(company_name, wikipedia_url):
        print('*' * 50)
        print(
            'Start updating wikipedia url for company {}'.format(company_name))
        print('New url is {}'.format(wikipedia_url))
        query = session.query(WikipediaDb).filter(
            WikipediaDb.company_name_w == company_name, )
        query.update({WikipediaDb.wiki_url_w: wikipedia_url},
                     synchronize_session="fetch")
        query.update({WikipediaDb.manual_entry: "Yes"},
                     synchronize_session="fetch")
        session.commit()
        print(
            'New wikipedia url ({0}) for company {1} has been successfully updated'.
            format(wikipedia_url, company_name))
        print('*' * 50)
        print('Start parsing page {}'.format(wikipedia_url))
        print('*' * 50)

        companies_dict = {company_name: wikipedia_url}

        print(companies_dict)

        project_name = 'default'
        scrapyd_data = {'project': project_name}
        decode_company_name = u'{}'.format(company_name.decode('utf-8'))
        print(decode_company_name)
        company_name_lower = u'update_{}'.format(
            decode_company_name[0].lower())
        update_company_name = company_name_lower + decode_company_name[1:]
        print(update_company_name)
        scrapyd_data.update(spider=WIKIPEDIA_NAME,
                            companies=update_company_name,
                            urls=wikipedia_url)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)

        from mx_crm.utils import get_scrapyd_jobs
        while True:
            resp = get_scrapyd_jobs(project_name)
            print(resp)
            if len(resp['finished']) >= 1:
                break
            time.sleep(5)

        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()
Example #7
 def google_update(self, google_names_urls):
     project_name = 'default'
     scrapyd_data = {'project': project_name}
     for name, url in google_names_urls.iteritems():
         scrapyd_data.update(
             spider=GOOGLE_NAME, companies=name
         )  # uses the common Google parser, the same as for regular parsing
         requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
     while True:
         resp = get_scrapyd_jobs(project_name)
         if len(resp['pending']) or len(resp['running']):
             logger.debug('{} spider still working'.format("google"))
             time.sleep(5)
         else:
             time.sleep(10)
             break
Example #8
    def update_wiki_company(self, company_name, wikipedia_url):
        company_name_for_file = u'{}'.format(company_name.decode('utf-8'))
        company_name = [company_name.lower()]
        wiki_url = wikipedia_url
        f = open("mx_crm/manual_queries/wiki_url.txt", "w")
        f.write(wiki_url.encode("utf-8"))
        f.close()
        f = io.open("mx_crm/manual_queries/wiki_company_name.txt",
                    "w",
                    encoding="utf-8")
        f.write(company_name_for_file)
        f.close()

        print('*' * 50)
        print('Start updating wikipedia info for company {}'.format(
            company_name[0]))
        query = session.query(WikipediaDb).filter(
            WikipediaDb.company_name_w == company_name[0], )
        query.update({WikipediaDb.manual_entry: "manual"},
                     synchronize_session="fetch")
        session.commit()
        print('*' * 50)
        print('Start parsing given wiki url {}'.format(wiki_url))
        print('*' * 50)
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        companies_dict = q.get_companies_for_wikipedia(company_name, True)
        companies = companies_dict.iterkeys()
        companies = SPLITTER.join(companies)
        urls = companies_dict.values()
        urls = SPLITTER.join(urls)
        scrapyd_data.update(spider=WIKIPEDIA_NAME,
                            companies=companies,
                            urls=urls)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("wikipedia"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()
Example #9
 def import_companies_update(self):
     old_companies = q.get_imported_companies_older_than_one_year()
     pprint(old_companies[:2])
     little_list = old_companies[:2]
     project_name = 'default'
     scrapyd_data = {'project': project_name}
     for name in little_list:
         scrapyd_data.update(
             spider=GOOGLE_NAME, companies=name
         )  # uses the common Google parser, the same as for regular parsing
         requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
     while True:
         resp = get_scrapyd_jobs(project_name)
         if len(resp['pending']) or len(resp['running']):
             logger.debug('{} spider still working'.format("google"))
             time.sleep(5)
         else:
             time.sleep(10)
             break
Example #10
def wikipedia_manual(single_name, single_url, file, force_update):
    parse_data = {}
    companies_names = []

    if single_name and single_url:
        parse_data[single_name] = single_url
        companies_names.append(single_name)
    if file:
        try:
            wiki_wb = load_workbook(filename=file)
        except IOError:
            raise Exception('File "%s" does not exist!!!' % file)
        wiki_ws = wiki_wb.worksheets[0]
        rows = wiki_ws.rows
        rows.next()  # skip the header row
        for row in rows:
            key = u'update_' + row[0].value if force_update else row[0].value
            parse_data[key] = row[1].value
            companies_names.append(row[0].value)

    project_name = 'default'
    json_path = json_data_path('manual_wiki_data.json')
    scrapyd_data = {'project': project_name}
    scrapyd_data.update(spider=WIKIPEDIA_NAME,
                        json_data=json_path,
                        is_manual_update_wiki=True)
    with open(json_path, 'w') as f:
        f.write(json.dumps({'manual_data': parse_data}))
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)

    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['finished']) >= 1:
            break
        time.sleep(5)

    logger.info('Updating resources...')
    sync_resources()

    if parse_data:
        logger.info('Creating all companies report...')
        all_companies_main(companies_names)
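Example #10 above hands the manual data to the spider indirectly: it writes {'manual_data': parse_data} to manual_wiki_data.json and passes only the file path (json_data) plus the is_manual_update_wiki flag in the scrapyd payload. The consumer side is not shown in these examples, so the reader below is an assumption sketched for illustration only.

import json


def load_manual_wiki_data(json_path):
    # Hypothetical reader for the file written by wikipedia_manual(): returns
    # the {company_name: wikipedia_url} mapping stored under 'manual_data'.
    with open(json_path) as f:
        payload = json.load(f)
    return payload.get('manual_data', {})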
Example #11
    def update_old(self):
        # --current-date=2019-03-14 --current-time=20:00 --last-date=2019-03-08 --last-time=19:59 --spider="report"
        get_old_google_companies()
        time.sleep(10)
        query = session.query(Company.name, Company.website).filter(
            Company.manual_entry == "old", )
        existing_names = []
        existing_urls = []
        for name in query[:1]:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        import os
        s_file = sys.argv
        logger.info(s_file)
        dict_names_urls = dict(zip(existing_names, existing_urls))
        for name, url in dict_names_urls.iteritems():
            companies = u'update_{}'.format(name.lower())
            logger.info(companies)
            scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            try:
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format("google"))
                    time.sleep(5)
                    logger.info(resp)
                else:
                    time.sleep(10)
                    break
            except KeyError:
                if resp['status'] == u'error':
                    time.sleep(5)
                    logger.info(resp)
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.sync_all()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #12
def aaa():
    company_name = ["Techem Energy Services GmbH"]
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies = q.get_companies_for_xing(company_name, True)
    companies = SPLITTER.join(companies)
    logger.debug(companies)
    #scrapyd_data.update(spider="xing", companies=companies, login="******", password="******")
    scrapyd_data.update(spider=XING_NAME,
                        companies=companies,
                        login="******",
                        password="******")
    logger.info(scrapyd_data)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
Example #13
    def manual_update(self):
        query = session.query(WikipediaDb.company_name_w,
                              WikipediaDb.wiki_url_w).filter(
                                  WikipediaDb.manual_entry == "Yes", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        import os
        s_file = sys.argv
        logger.info(s_file)
        # dict_names_urls = dict(zip('sdfsdf', 'dsfsdf.com'))
        dict_names_urls = dict(zip(existing_names, existing_urls))
        for name, url in dict_names_urls.iteritems():
            scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME,
                                companies=name,
                                urls=url)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            try:
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format("wikipedia"))
                    time.sleep(5)
                    logger.info(resp)
                else:
                    time.sleep(10)
                    break
            except KeyError:
                if resp['status'] == u'error':
                    time.sleep(5)
                    logger.info(resp)
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.wiki_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #14
    def manual_update(self):
        xing_login = '******'
        xing_password = '******'
        query = session.query(XingCompanyDb.company_name_x,
                              XingCompanyDb.xing_url).filter(
                                  XingCompanyDb.manual_entry == "Yes", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}

        pprint(existing_names)

        dict_names_urls = dict(zip(existing_names, existing_urls))

        for name, url in dict_names_urls.iteritems():
            #scrapyd_data.update(spider=XING_MANUAL_NAME, companies='Ckw Centralschweizerische Kraftwerke', urls='https://www.xing.com/companies/ckw',
            scrapyd_data.update(spider=XING_MANUAL_NAME,
                                companies=name,
                                urls=url,
                                login=xing_login,
                                password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("xing"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.xing_sync()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #15
def lll():
    company_name = ["element GmbH".lower()]
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies_dict = q.get_companies_for_wikipedia(company_name, True)
    companies = companies_dict.iterkeys()
    companies = SPLITTER.join(companies)
    urls = companies_dict.values()
    urls = SPLITTER.join(urls)
    print(urls)
    print(companies)
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    print(scrapyd_data)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("wikipedia"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
Example #16
    def manual_update(self):
        query = session.query(Company.name, Company.website).filter(
            Company.manual_entry == "Yes", )
        existing_names = []
        existing_urls = []
        for name in query:
            existing_names.append(name[0])
            existing_urls.append(name[1])
        project_name = 'default'
        scrapyd_data = {'project': project_name}
        # dict_names_urls = dict(zip(['bueroservice 99 gmbh'], []))
        dict_names_urls = dict(zip(existing_names, existing_urls))
        little_list_force_update = []
        for company in existing_names:
            company = u'update_{}'.format(company)
            little_list_force_update.append(company)
        little_list_force_update = SPLITTER.join(little_list_force_update)
        logger.debug(little_list_force_update)
        scrapyd_data.update(spider=GOOGLE_NAME,
                            companies=little_list_force_update)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        #for name, url in dict_names_urls.iteritems():
        #    pprint(name)
        #    scrapyd_data.update(spider=GOOGLE_NAME, companies=name)   # here used a comon google parser such as for casual parsing
        #    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("google"))
                time.sleep(5)
            else:
                time.sleep(10)
                break
        logger.info('Updating resources...')
        from mx_crm.synchronizers.resource_sync import ResourceSync
        RS = ResourceSync()
        RS.sync_all()

        RatingUpdate().update_squirrel_rating(existing_names)
Example #17
def test(single_name, single_url, force_update):
    parse_data = {}
    companies_names = []

    if single_name and single_url:
        parse_data[single_name] = single_url
        companies_names.append(single_name)

    project_name = 'default'
    scrapyd_data = {'project': project_name}

    scrapyd_data.update(spider=WIKIPEDIA_NAME, is_manual_update_wiki=True)

    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)

    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['finished']) >= 1:
            break
        time.sleep(5)

    logger.info('Updating resources...')
    sync_resources()
Example #18
 def tet():
     project_name = 'default'
     scrapyd_data = {'project': project_name}
     companies_names = []
     c_name = "50Hertz Transmission GmbH"
     companies_names.append(c_name)
     force_update = True
     companies_dict = q.get_companies_for_wikipedia(companies_names,
                                                    force_update)
     logger.debug(companies_dict)
     companies = companies_dict.iterkeys()
     companies = SPLITTER.join(companies)
     logger.debug(companies)
     urls = SPLITTER.join(companies_dict.itervalues())
     scrapyd_data.update(spider=WIKIPEDIA_NAME,
                         companies=companies,
                         urls=urls)
     requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
     while True:
         resp = get_scrapyd_jobs(project_name)
         if not len(resp['finished']):
             time.sleep(3)
         else:
             break
Example #19
    def update_info():
        companies_names = ['50Hertz Transmission GmbH'.decode('utf-8')]
        companies_names = map(lambda c: c.lower(), companies_names)
        logger.debug('Found companies: {}'.format(companies_names))

        project_name = 'default'
        scrapyd_data = {'project': project_name}
        force_update = True
        spider = 'wikipedia'
        xing_login = "******"
        xing_password = "******"

        if spider == "wikipedia":
            companies_dict = q.get_companies_for_wikipedia(
                companies_names, force_update)
            logger.debug(companies_dict)
            companies = companies_dict.iterkeys()
            companies = SPLITTER.join(companies)
            logger.debug(companies)
            urls = SPLITTER.join(companies_dict.itervalues())
            print(companies_dict)

            scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies,
                                urls=urls)
            print("@@@@@@@@@@@@@@@@@@@")
            print(scrapyd_data)

            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
            while True:
                resp = get_scrapyd_jobs(project_name)
                if not len(resp['finished']):
                    time.sleep(3)
                else:
                    break
        elif spider == "xing":
            companies = q.get_companies_for_xing(companies_names, force_update)
            companies = SPLITTER.join(companies)
            logger.debug(companies)
            scrapyd_data.update(spider=XING_NAME,
                                companies=companies,
                                login=xing_login,
                                password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
            while True:
                resp = get_scrapyd_jobs(project_name)
                if not len(resp['finished']):
                    time.sleep(3)
                else:
                    break
        else:
            spiders = OrderedDict()
            spiders[WIKIPEDIA_NAME] = []
            spiders[XING_NAME] = q.get_companies_for_xing(
                companies_names, force_update)

            for spider_name, companies in spiders.items():
                if spider_name == WIKIPEDIA_NAME:
                    companies = q.get_companies_for_wikipedia(
                        companies_names, force_update)
                logger.debug('{} spider has started'.format(spider_name))
                logger.debug(companies)
                post_data = scrapyd_data.copy()
                post_data.update(spider=spider_name)
                if spider_name == WIKIPEDIA_NAME:
                    companies_dict = companies.copy()
                    companies = SPLITTER.join(companies_dict.iterkeys())
                    urls = SPLITTER.join(companies_dict.itervalues())
                    post_data.update(companies=companies, urls=urls)
                else:
                    companies = SPLITTER.join(companies)
                    post_data.update(companies=companies,
                                     login=xing_login,
                                     password=xing_password)
                requests.post(SCRAPYD_SCHEDULE_URL, post_data)

                while True:
                    resp = get_scrapyd_jobs(project_name)
                    if len(resp['pending']) or len(resp['running']):
                        logger.debug(
                            '{} spider still working'.format(spider_name))
                        time.sleep(5)
                    else:
                        time.sleep(10)
                        break

            while True:
                resp = get_scrapyd_jobs(project_name)
                if len(resp['finished']) < len(spiders):
                    time.sleep(3)
                else:
                    break

        logger.info('Updating resources...')
        sync_resources()
Example #20
def main(days, allow_import, force_update, import_file, db_update, spider,
         xing_login, xing_password, **kwargs):
    """
    Main function.
    :param days: days to extract requests from
    :param force_update: force update companies info in database from spiders
    :param allow_import: allows import
    :param import_file: path to xlsx file with the list of companies
    :param db_update: update info for all database companies
    :param spider: spider name
    :param xing_login: username/email for xing login
    :param xing_password: password for xing login
    """
    logger.info("Synchronize accesslogs with remote DB.")
    if os.name == 'nt':
        accesslog_sync()

    drupal_companies = None

    if allow_import:
        companies_names = XlsxImport(import_file,
                                     force_update=force_update).run()
        logger.info("IMPORT: reading companies from xlsx file")
        logger.info(companies_names)
    elif db_update:
        force_update = True
        companies_names = q.get_all_companies_names()
        #companies_names = ['eto gruppe beteiligungen kg'.decode('utf-8')]
    else:
        q.update_db_hosts()
        start_date, end_date = prepare_date_to_drupal_execute(days, **kwargs)
        google_analytics_companies = q.get_google_analytics_sessions(
            end_date, start_date, True)
        drupal_companies = q.get_drupal_sessions(end_date, start_date)
        companies_names = drupal_companies.keys()
        dates = {'end_date': end_date, 'start_date': start_date}
        #
        #companies_names = ['eto gruppe beteiligungen kg'.decode('utf-8')]

    companies_names = map(lambda c: c.lower(), companies_names)
    logger.debug('Found companies: {}'.format(companies_names))

    project_name = 'default'
    scrapyd_data = {'project': project_name}
    if spider == GOOGLE_NAME:
        companies = q.get_companies_for_google_search(companies_names,
                                                      force_update)
        logger.info(companies)
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            logger.info(resp)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break

    elif spider == GOOGLE_IMPORT:
        companies = q.get_imported_companies_older_than_one_year()
        little_list = companies[:75]
        little_list_force_update = []
        for company in little_list:
            company = u'update_{}'.format(company)
            little_list_force_update.append(company)
        logger.debug(little_list_force_update)
        little_list_force_update = SPLITTER.join(little_list_force_update)
        logger.debug(little_list_force_update)
        scrapyd_data.update(spider=GOOGLE_NAME,
                            companies=little_list_force_update)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            logger.info(resp)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == WIKIPEDIA_NAME:
        companies_dict = q.get_companies_for_wikipedia(companies_names,
                                                       force_update)
        logger.debug(companies_dict)
        companies = companies_dict.iterkeys()
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        urls = SPLITTER.join(companies_dict.itervalues())
        scrapyd_data.update(spider=WIKIPEDIA_NAME,
                            companies=companies,
                            urls=urls)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == WIKIPEDIA_FIXING:
        companies_names = q.get_manual_wikipedia_companies()
        force_update_companies = []
        for company in companies_names:
            company = u'update_{}'.format(company)
            force_update_companies.append(company)
        logger.debug(force_update_companies)
        force_update_companies = SPLITTER.join(force_update_companies)
        logger.debug(force_update_companies)
        urls = q.get_websites_wikipedia(companies_names)
        q.set_wikipedia_manual_entry_manual(companies_names)
        scrapyd_data.update(spider=WIKIPEDIA_NAME,
                            companies=force_update_companies,
                            urls=urls)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    elif spider == XING_NAME:
        companies = q.get_companies_for_xing(companies_names, force_update)
        companies = SPLITTER.join(companies)
        logger.debug(companies)
        scrapyd_data.update(spider=XING_NAME,
                            companies=companies,
                            login=xing_login,
                            password=xing_password)
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    else:
        spiders = OrderedDict()
        spiders[GOOGLE_NAME] = q.get_companies_for_google_search(
            companies_names, force_update)
        spiders[WIKIPEDIA_NAME] = []
        # spiders[XING_NAME] = q.get_companies_for_xing([u'amazon.com inc.'], force_update)
        spiders[XING_NAME] = q.get_companies_for_xing(companies_names,
                                                      force_update)
        # update_google_analytics_companies = force_update_google_analytics_companies(google_analytics_companies)
        google_name_list = list(spiders[GOOGLE_NAME])
        # google_name_list += update_google_analytics_companies
        # spiders[GOOGLE_NAME] = ([u'update_amazon.com inc.'])
        spiders[GOOGLE_NAME] = google_name_list
        # print(spiders[GOOGLE_NAME])

        for spider_name, companies in spiders.items():
            logger.info(spider_name)
            if spider_name == WIKIPEDIA_NAME:
                # companies = q.get_companies_for_wikipedia([u'amazon.com inc.'], force_update)
                companies = q.get_companies_for_wikipedia(
                    companies_names, force_update)
            logger.debug('{} spider has started'.format(spider_name))
            logger.debug(companies)
            post_data = scrapyd_data.copy()
            post_data.update(spider=spider_name)
            if spider_name == WIKIPEDIA_NAME:
                companies_dict = companies.copy()
                companies = SPLITTER.join(companies_dict.iterkeys())
                urls = SPLITTER.join(companies_dict.itervalues())
                post_data.update(companies=companies, urls=urls)
            else:
                companies = SPLITTER.join(companies)
                post_data.update(companies=companies,
                                 login=xing_login,
                                 password=xing_password)
            requests.post(SCRAPYD_SCHEDULE_URL, post_data)

            while True:
                resp = get_scrapyd_jobs(project_name)
                if len(resp['pending']) or len(resp['running']):
                    logger.debug('{} spider still working'.format(spider_name))
                    time.sleep(5)
                else:
                    time.sleep(10)
                    break

        while True:
            resp = get_scrapyd_jobs(project_name)
            if len(resp['finished']) < len(spiders):
                time.sleep(3)
            else:
                break

    logger.info('Updating resources...')
    sync_resources()

    if drupal_companies:
        logger.info('Creating drupal report...')
        me = MatchExecutor()
        me.create_report(drupal_companies,
                         google_analytics_companies,
                         dates=dates)
    elif allow_import and companies_names:
        logger.info('Creating all companies report...')
        all_companies_main(companies_names)
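All examples post a plain form payload ('project', 'spider', plus spider arguments such as companies, urls, login and password) to SCRAPYD_SCHEDULE_URL and never inspect the response. Assuming that URL points at scrapyd's schedule.json endpoint, which answers with a JSON body containing 'status' and 'jobid', a small sketch of a scheduling call that checks the reply could look like this (a hypothetical wrapper, not part of the original code):

import requests


def schedule_spider(schedule_url, project, spider, **spider_args):
    # Post one job to scrapyd and return its job id, raising on failure.
    data = {'project': project, 'spider': spider}
    data.update(spider_args)  # e.g. companies=..., urls=..., login=..., password=...
    resp = requests.post(schedule_url, data=data)
    resp.raise_for_status()
    payload = resp.json()
    if payload.get('status') != 'ok':  # scrapyd reports failures in the body
        raise RuntimeError('scrapyd did not accept the job: {0}'.format(payload))
    return payload.get('jobid')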