def update_squirrel_rating():
    companies_names_xing = ['thuega meteringservice gmbh']
    for name in companies_names_xing:
        pprint(name)
        query_x_url = session.query(
            XingCompanyDb.company_name_x, XingCompanyDb.xing_url).filter(
                XingCompanyDb.company_name_x == name,
            )
        try:
            xing_url = query_x_url[0][1]
        except IndexError:
            xing_url = u''
        pprint(xing_url)
        if xing_url != u'':
            # The entry already has a xing url; nothing to fix here.
            continue
        # No url stored: fall back to the xing_page from the companies table.
        query_x_p = session.query(
            Company.xing_page).filter(Company.name == name)
        xing_page = query_x_p[0][0]
        pprint(xing_page)
        query = session.query(XingCompanyDb).filter(
            XingCompanyDb.company_name_x == name,
        )
        query.update({XingCompanyDb.xing_url: xing_page},
                     synchronize_session="fetch")
        session.commit()
def get_companies_for_xing(companies, force_update):
    """
    Prepares the list of company names for searching on Xing.

    :param companies: list of companies that made requests during the specified range
    :param force_update: force-update company info in the database from the spiders
    """
    existing_entries = session.query(XingCompanyDb).join(
        Company, Company.id == XingCompanyDb.xc_id).filter(
            Company.name.in_(companies),
            Company.xing_page != 'NA',
            # Use isnot() so the NULL check runs in SQL; a plain Python
            # `is not None` on a column object is always True.
            Company.xing_page.isnot(None),
        )
    existing_objects_by_name = set(
        session.query(XingCompanyDb).filter(
            XingCompanyDb.company_name_x.in_(companies)))
    # Entries that exist by name but lack a usable joined company row
    # are stale duplicates and get deleted.
    to_delete_ids = {
        c.x_id
        for c in existing_objects_by_name - set(existing_entries)
    }
    if to_delete_ids:
        session.query(XingCompanyDb).filter(
            XingCompanyDb.x_id.in_(to_delete_ids)).delete(
                synchronize_session='fetch')
        session.commit()
    existing_names = {
        entry.company_name_x.lower()
        for entry in existing_entries
    }
    res = set(companies) - existing_names
    if force_update:
        res.update({u'update_' + name for name in existing_names})
    return res
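# Usage sketch (illustrative only; assumes `session` is bound to a test
# database where 'acme gmbh' already has a Xing entry and 'beta ag' does not):
#
#   get_companies_for_xing([u'acme gmbh', u'beta ag'], force_update=False)
#   # -> set([u'beta ag'])
#   get_companies_for_xing([u'acme gmbh', u'beta ag'], force_update=True)
#   # -> set([u'beta ag', u'update_acme gmbh'])
#
# The 'update_' prefix presumably tells the spider to refresh an existing
# entry rather than create a new one.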
def recalculate_per(self, timestamp=settings.TWO_WEEKS_AGO):
    self._load_accesslogs_timestamps_to_memory(timestamp=timestamp)
    update_list = []
    index = 0  # keeps the final log line safe when there are no items
    for index, item in enumerate(
            self.get_db_ip(hosts=self.accesslogs_timestamps.keys())):
        timestamps = self.filter_accesslogs_timestamp(item)
        session_total = self.session_total_by_host(timestamps)
        update_dict = {
            'ip_id': item.ip_id,
            'total_session_length':
                (item.total_session_length or 0) + session_total.get('time', 0),
            'total_visit_count':
                (item.total_visit_count or 0) + session_total.get('visited', 0),
            'last_total_update':
                session_total.get('last_timestamp', time.time()),
        }
        update_list.append(update_dict)
        # Flush to the database in batches of 5000 to keep memory bounded.
        if index and index % 5000 == 0:
            session.bulk_update_mappings(DbIpDatabase, update_list)
            session.commit()
            update_list = []
            logger.info('Updated %s records.' % str(index))
    # Flush the final partial batch.
    session.bulk_update_mappings(DbIpDatabase, update_list)
    session.commit()
    logger.info('Updated %s records.' % str(index))
    self._log_update({'total_fields_last_calculation': time.time()})
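# recalculate_per() above (and recalculate() below) both flush updates in
# batches of 5000. A generic helper capturing that pattern could look like
# this — a sketch, not part of the original module; the name and the
# batch_size parameter are assumptions:

def bulk_update_in_batches(mapper, mappings, batch_size=5000):
    """Apply bulk_update_mappings in fixed-size batches to bound memory use."""
    batch = []
    for mapping in mappings:
        batch.append(mapping)
        if len(batch) >= batch_size:
            session.bulk_update_mappings(mapper, batch)
            session.commit()
            batch = []
    if batch:  # flush the final partial batch
        session.bulk_update_mappings(mapper, batch)
        session.commit()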
def main():
    """
    Looks up the maximum timestamp in squirrel and imports everything newer
    from the drupal accesslog. Make sure you are not connected via the
    T-Mobile stick, or the database connection to drupal will fail.
    """
    logger.info("Start synchronize accesslogs.")
    start_time = time.time()
    logger.info("Get max current timestamp.")
    local_accesslog = session.query(func.max(Accesslog.timestamp)).first()
    local_accesslog = local_accesslog[0] if local_accesslog else None
    if not local_accesslog:
        return
    logger.info("Get all new accesslogs.")
    drupal_accesslogs = drupal_session.query(Accesslog).filter(
        Accesslog.timestamp > local_accesslog)
    logger.info("Build bulk insert query.")
    session.bulk_insert_mappings(Accesslog, [
        dict(aid=i.aid, sid=i.sid, title=i.title, path=i.path, url=i.url,
             hostname=i.hostname, uid=i.uid, timer=i.timer,
             timestamp=i.timestamp) for i in drupal_accesslogs
    ])
    session.commit()
    logger.info("Data loaded in %s seconds. Count: %s" %
                (str(time.time() - start_time), drupal_accesslogs.count()))
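# main() materializes every new accesslog row before the bulk insert. If the
# backlog can be large, a chunked variant along these lines may be safer
# (a sketch; Query.yield_per() is standard SQLAlchemy, the chunk sizes are
# assumptions):
#
#   rows = []
#   for i in drupal_accesslogs.yield_per(1000):
#       rows.append(dict(aid=i.aid, sid=i.sid, title=i.title, path=i.path,
#                        url=i.url, hostname=i.hostname, uid=i.uid,
#                        timer=i.timer, timestamp=i.timestamp))
#       if len(rows) >= 10000:
#           session.bulk_insert_mappings(Accesslog, rows)
#           session.commit()
#           rows = []
#   if rows:
#       session.bulk_insert_mappings(Accesslog, rows)
#       session.commit()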
def process_item(self, item, spider):
    logging.info("!!!!!!!!!!ITEM!!!!!!!!!!!!")
    logging.info(item)
    company_name = item['company_name']
    headquarters = item.get('sitz', '')[:50] if item.get('sitz') else None
    # The column holds at most 130 characters; fall back to
    # scheme://hostname for overly long URLs.
    if item.get('wiki_company_website') and len(
            item['wiki_company_website']) > 130:
        parsed_url = urlparse.urlparse(item['wiki_company_website'])
        item['wiki_company_website'] = '{protocol}://{hostname}'.format(
            protocol=parsed_url.scheme, hostname=parsed_url.hostname)
    item = dict(summary_wikipedia_w=item['summary'],
                categories_wikipedia_w=item['categories'],
                revenue_wikipedia_w=item.get('revenue', ''),
                revenue_currency_wiki_w=item.get('currency', ''),
                branch_wikipedia_w=item.get('branche', ''),
                wiki_url_w=item['company_website'],
                headquarters_wiki_w=headquarters,
                employees_wikipedia_w=item.get('mitarbeiter', ''),
                company_website_w=item.get('wiki_company_website', ''),
                last_update_w=func.now())
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name,
    )
    # wiki_company.update(item, synchronize_session='fetch')
    query.update(item, synchronize_session=False)
    # Track how the entry was produced: entries previously flagged "old"
    # go back to "No"; anything else is marked as a manual update.
    if query[0].manual_entry == "old":
        query.update({WikipediaDb.manual_entry: "No"},
                     synchronize_session="fetch")
    else:
        query.update({WikipediaDb.manual_entry: "manual"},
                     synchronize_session="fetch")
    session.commit()
def mass_evaluation(self):
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    force_update = True
    query = session.query(Company.name).filter(
        Company.manual_entry == "Yes",
    )
    # Collect the names first: once manual_entry is flipped to "manual",
    # the filter above no longer matches any rows.
    companies = [u'update_{}'.format(name[0].lower()) for name in query]
    query.update({Company.manual_entry: "manual"},
                 synchronize_session="fetch")
    session.commit()
    #companies = q.get_companies_for_google_search(companies_names, force_update)
    #companies = SPLITTER.join(companies)
    logger.debug(companies)
    scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
def fixing_wrong_old_wiki(name):
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == name,
    )
    query.update({WikipediaDb.manual_entry: "No"},
                 synchronize_session="fetch")
    query.update({WikipediaDb.last_update_w: func.now()},
                 synchronize_session="fetch")
    session.commit()
def xing_update(self, xing_names_urls):
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    xing_login = '******'
    xing_password = '******'
    for name, url in xing_names_urls.iteritems():
        # Only mark entries that already have a usable url; otherwise fall
        # back to the xing_page stored on the companies table.
        if url not in (u'', u'N/A'):
            #scrapyd_data.update(spider=XING_MANUAL_NAME, companies=name, urls=url,
            #                    login=xing_login, password=xing_password)
            #requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
            query = session.query(XingCompanyDb).filter(
                XingCompanyDb.company_name_x == name,
            )
            query.update({XingCompanyDb.manual_entry: "Yes"},
                         synchronize_session="fetch")
            session.commit()
        else:
            query_x_p = session.query(
                Company.xing_page).filter(Company.name == name)
            xing_page = query_x_p[0][0]
            query = session.query(XingCompanyDb).filter(
                XingCompanyDb.company_name_x == name,
            )
            query.update({XingCompanyDb.manual_entry: "Yes"},
                         synchronize_session="fetch")
            query.update({XingCompanyDb.xing_url: xing_page},
                         synchronize_session="fetch")
            session.commit()
            logger.info("PROBLEMS: no xing url for %s", name)
def recalculate(self, companies_names=None):
    # Avoid a mutable default argument shared between calls.
    companies_names = companies_names or []
    self._load_accesslogs_timestamps_to_memory()
    update_list = []
    index = 0
    for index, item in enumerate(
            self.get_db_ip(companies_names=companies_names)):
        # Normalize the IP to its /24 network before the lookup.
        timestamps = self.accesslogs_timestamps.get(
            re.sub(r'\d+$', '0', item.ip_ip), [])
        session_total = self.session_total_by_host(timestamps)
        update_dict = {
            'ip_id': item.ip_id,
            'total_session_length': session_total.get('time', 0),
            'total_visit_count': session_total.get('visited', 0),
            'last_total_update':
                session_total.get('last_timestamp', time.time()),
        }
        update_list.append(update_dict)
        # Flush to the database in batches of 5000, as in recalculate_per().
        if index and index % 5000 == 0:
            session.bulk_update_mappings(DbIpDatabase, update_list)
            session.commit()
            update_list = []
            logger.info('Updated %s records.' % str(index))
    session.bulk_update_mappings(DbIpDatabase, update_list)
    session.commit()
    logger.info('Updated %s records.' % str(index))
    self._log_update({'total_fields_last_full_calculation': time.time()})
def update_squirrel_rating(self, companies_names=None):
    companies_names = companies_names or []
    websites = []
    for name in companies_names:
        query = session.query(Company.website).filter(Company.name == name)
        websites.append(query[0][0])
    rating_parts = SquirrelRating().calc(companies_names, websites, True)
    for name in rating_parts.keys():
        rating_update_info = dict(
            mx_crm_location_level=rating_parts.get(name).get('location'),
            mx_crm_branch_level=rating_parts.get(name).get('branch'),
            mx_crm_google_evaluation=rating_parts.get(name).get('google_ev'),
            mx_crm_wiki_rating_points=rating_parts.get(name).get('wiki_size'),
            mx_crm_xing_rating_points=rating_parts.get(name).get('xing_size'),
            mx_crm_revenue_level=rating_parts.get(name).get('revenue_point'),
            squirrel_rating=rating_parts.get(name).get('score'))
        query = session.query(Company).filter(Company.name == name)
        query.update(rating_update_info, synchronize_session=False)
        session.commit()
def _process_google_item(self, item, spider):
    q = session.query(Company).filter(
        Company.name == item['company_name'])
    logging.info("Processing google item: %s", item)
    if q.count() and item['update']:
        c = q.first()
        website = 'NA'
        if c.website:
            website = c.website
        elif c.website_long:
            website = urlparse.urlsplit(c.website_long)[1]
        if c.manual_entry == 'Yes':
            q.update({
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': datetime.now(),
                'manual_entry': 'manual',
            })
            logging.info("MANUAL")
        elif c.manual_entry == 'old':
            q.update({
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': datetime.now(),
                'manual_entry': 'No'
            })
            session.commit()
        else:
            dn = datetime.now()
            update_item = {
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': dn
            }
            logging.info(update_item)
            q.update(update_item)
    elif not q.count():
        new_company = Company(name=item['company_name'],
                              website=item['url'],
                              website_long=item['url_long'])
        session.add(new_company)
    # Pending changes from the uncommitted branches are flushed in
    # close_spider().
def update_db_hosts():
    ips = session.query(DbIpDatabase)
    logger.info(
        'Starting update IPs ({}) from 255.255.255.255 to 255.255.255.0'.
        format(ips.count()))
    for ip in ips:
        ip.ip_ip = ip_digits(ip.ip_ip)
    session.commit()
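# `ip_digits` is defined elsewhere in the project. Judging from the
# equivalent expression in recalculate() above (re.sub(r'\d+$', '0', ip)),
# it most likely zeroes the last octet so hosts are grouped per /24 network,
# which matches the log message above. A minimal sketch under that
# assumption:
#
#   def ip_digits(ip):
#       """Normalize an IPv4 address to its /24 network: '10.1.2.37' -> '10.1.2.0'."""
#       return re.sub(r'\d+$', '0', ip)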
def _log_update(self, log):
    # Keep a single CalculationsTime row: create it on first use,
    # update it in place afterwards.
    calc_log = session.query(CalculationsTime).first()
    if not calc_log:
        calc_log = CalculationsTime(**log)
        session.add(calc_log)
    else:
        session.query(CalculationsTime).update(log)
    session.commit()
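# Typical usage of this single-row upsert, as seen in recalculate_per()
# above (the key name is whatever calculation just finished):
#
#   self._log_update({'total_fields_last_calculation': time.time()})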
def fixing_wrong_old(name):
    query = session.query(XingCompanyDb).filter(
        XingCompanyDb.company_name_x == name,
    )
    query.update({XingCompanyDb.manual_entry: "No"},
                 synchronize_session="fetch")
    query.update({XingCompanyDb.last_update_x: func.now()},
                 synchronize_session="fetch")
    session.commit()
def mass_update(self, company_name, xing_login, xing_password, new_xing_url):
    xing_url = new_xing_url
    # Persist the manual url so the spider side can pick it up.
    with open("mx_crm/manual_queries/xing_url.txt", "w") as f:
        f.write(xing_url)
    print('*' * 50)
    print('Start updating xing info for company {}'.format(company_name))
    query = session.query(XingCompanyDb).filter(
        XingCompanyDb.company_name_x == company_name,
    )
    query.update({XingCompanyDb.manual_entry: "ololo"},
                 synchronize_session="fetch")
    query.update({XingCompanyDb.xing_url: new_xing_url},
                 synchronize_session="fetch")
    session.commit()
    print('*' * 50)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    decode_company_name = u'{}'.format(company_name.decode('utf-8'))
    print(decode_company_name)
    # Prefix with 'update_', lowercasing only the first character.
    update_company_name = u'update_{}'.format(
        decode_company_name[0].lower()) + decode_company_name[1:]
    print(update_company_name)
    companies_names = [decode_company_name.lower()]
    force_update = True
    print('Start parsing given xing url {}'.format(xing_url))
    companies = q.get_companies_for_xing(companies_names, force_update)
    companies = SPLITTER.join(companies)
    scrapyd_data.update(spider=XING_NAME,
                        companies=companies,
                        login=xing_login,
                        password=xing_password)
    # The job is scheduled and awaited twice, so the spider runs a second
    # time once the first run has finished.
    from mx_crm.utils import get_scrapyd_jobs
    for _ in range(2):
        requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
        while True:
            resp = get_scrapyd_jobs(project_name)
            if not len(resp['finished']):
                time.sleep(3)
            else:
                break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.xing_sync()
def log_start(self, type, description='', additional_data=''):
    le = LogExecutions(type=type,
                       description=description,
                       start_datetime=datetime.now(),
                       additional_data=additional_data)
    session.add(le)
    session.commit()
    self.current_session = le
def set_wikipedia_manual_entry_manual(companies=None):
    for i in companies or []:
        try:
            query = session.query(WikipediaDb).filter(
                WikipediaDb.company_name_w == i,
            )
            query.update({WikipediaDb.manual_entry: "manual"},
                         synchronize_session="fetch")
            session.commit()
        except Exception:
            # Skip entries that fail; a bare except would also swallow
            # KeyboardInterrupt and SystemExit.
            continue
def update_google_url(self, company_name, google_url):
    print('*' * 50)
    print('Start updating google website for company {}'.format(company_name))
    query = session.query(Company).filter(Company.name == company_name)
    query.update({Company.manual_entry: "yes"},
                 synchronize_session="fetch")
    query.update({Company.website: google_url},
                 synchronize_session="fetch")
    session.commit()
    print('*' * 50)
def update_wikipedia_url(company_name, wikipedia_url):
    print('*' * 50)
    print('Start updating wikipedia url for company {}'.format(company_name))
    print('New url is {}'.format(wikipedia_url))
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name,
    )
    query.update({WikipediaDb.wiki_url_w: wikipedia_url},
                 synchronize_session="fetch")
    query.update({WikipediaDb.manual_entry: "Yes"},
                 synchronize_session="fetch")
    session.commit()
    print('New wikipedia url ({0}) for company {1} has been updated successfully.'
          .format(wikipedia_url, company_name))
    print('*' * 50)
    print('Start parsing page {}'.format(wikipedia_url))
    print('*' * 50)
    companies_dict = {company_name: wikipedia_url}
    print(companies_dict)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    decode_company_name = u'{}'.format(company_name.decode('utf-8'))
    print(decode_company_name)
    # Prefix with 'update_', lowercasing only the first character.
    update_company_name = u'update_{}'.format(
        decode_company_name[0].lower()) + decode_company_name[1:]
    print(update_company_name)
    scrapyd_data.update(spider=WIKIPEDIA_NAME,
                        companies=update_company_name,
                        urls=wikipedia_url)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        from mx_crm.utils import get_scrapyd_jobs
        resp = get_scrapyd_jobs(project_name)
        print(resp)
        if len(resp['finished']) >= 1:
            break
        time.sleep(5)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
def _get_branch_level(self, branch_level_xing, branch_level_wiki, company):
    if not branch_level_xing and not branch_level_wiki:
        return 0
    total_branch = 0
    if branch_level_xing:
        xing_branch = branch_level_xing.get(company.lower())
        # -20 is the sentinel for "could not be evaluated"; recalculate it.
        if xing_branch == -20:
            xing_branch = BranchEvaluationLevel().protection_calc_xing(company)
        # Hardcoded corrections for known problematic entries.
        if company == 'Rittal GmbH & Co. KG':
            xing_branch = 20
        if company == 'Washtec Cleaning Technology GmbH':
            xing_branch = BranchEvaluationLevel().protection_calc_xing(
                'Washtec Cleaning Technology GmbH')
    else:
        xing_branch = None
    if branch_level_wiki:
        wiki_branch = branch_level_wiki.get(company.lower())
        if wiki_branch == -20:
            wiki_branch = BranchEvaluationLevel().protection_calc_wiki(company)
        if company == 'Washtec Cleaning Technology GmbH':
            wiki_branch = BranchEvaluationLevel().protection_calc_wiki(
                'Washtec Cleaning Technology GmbH')
    else:
        wiki_branch = None
    # Combine the two levels: use whichever is present, average when both are.
    if xing_branch is None:
        total_branch = wiki_branch
    if wiki_branch is None:
        total_branch = xing_branch
    if not xing_branch and not wiki_branch:
        total_branch = 0
    if xing_branch and wiki_branch:
        total_branch = (xing_branch + wiki_branch) / 2
    # Persist the raw per-source levels, defaulting to 0 when missing.
    xing_b_for_save = branch_level_xing.get(company.lower()) if branch_level_xing else None
    wiki_b_for_save = branch_level_wiki.get(company.lower()) if branch_level_wiki else None
    if xing_b_for_save is None:
        xing_b_for_save = 0
    if wiki_b_for_save is None:
        wiki_b_for_save = 0
    query = session.query(Company).filter(Company.name == company)
    query.update({Company.mx_crm_wiki_branch: wiki_b_for_save},
                 synchronize_session="fetch")
    query.update({Company.mx_crm_xing_branch: xing_b_for_save},
                 synchronize_session="fetch")
    session.commit()
    return total_branch
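# Worked example of the combination rule above (values are illustrative):
#   xing_branch=20,   wiki_branch=10   -> total_branch = (20 + 10) / 2 = 15
#   xing_branch=None, wiki_branch=10   -> total_branch = 10
#   xing_branch=None, wiki_branch=None -> total_branch = 0
# Note that with two integer levels Python 2's `/` truncates,
# e.g. (15 + 20) / 2 == 17.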
def update_wiki_company(self, company_name, wikipedia_url):
    company_name_for_file = u'{}'.format(company_name.decode('utf-8'))
    company_name = [company_name.lower()]
    wiki_url = wikipedia_url
    # Persist the manual input so the spider side can pick it up.
    with open("mx_crm/manual_queries/wiki_url.txt", "w") as f:
        f.write(wiki_url.encode("utf-8"))
    with io.open("mx_crm/manual_queries/wiki_company_name.txt", "w",
                 encoding="utf-8") as f:
        f.write(company_name_for_file)
    print('*' * 50)
    print('Start updating wikipedia info for company {}'.format(
        company_name[0]))
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name[0],
    )
    query.update({WikipediaDb.manual_entry: "manual"},
                 synchronize_session="fetch")
    session.commit()
    print('*' * 50)
    print('Start parsing given wiki url {}'.format(wiki_url))
    print('*' * 50)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    companies_dict = q.get_companies_for_wikipedia(company_name, True)
    companies = SPLITTER.join(companies_dict.iterkeys())
    urls = SPLITTER.join(companies_dict.values())
    scrapyd_data.update(spider=WIKIPEDIA_NAME, companies=companies, urls=urls)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("wikipedia"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
def get_old_google_companies():
    date_now = datetime.now()
    # Companies not updated for roughly three years.
    last_date = (date_now - timedelta(days=3 * 365)).strftime('%Y-%m-%d')
    old_companies = session.query(Company).filter(
        Company.last_update <= last_date)
    old_names = [c.name for c in old_companies]
    pprint(len(old_names))
    # Process at most 40 companies per run.
    for name in old_names[:40]:
        query_w_url = session.query(Company.name, Company.website).filter(
            Company.name == name,
        )
        try:
            website = query_w_url[0][1]
        except IndexError:
            website = u''
        pprint(website)
        query = session.query(Company).filter(Company.name == name)
        if website is None or website in (u'', u'NA', u'N/A'):
            # No usable website: only bump the timestamp.
            query.update({Company.last_update: func.now()},
                         synchronize_session="fetch")
        else:
            # A website exists: flag the entry for re-evaluation.
            query.update({Company.manual_entry: "old"},
                         synchronize_session="fetch")
        session.commit()
def parse(self, response):
    if response.status == 404:
        logger.info("Wikipedia page not found: %s", response.url)
        query = session.query(WikipediaDb).filter(
            WikipediaDb.company_name_w == response.meta['company_name'],
        )
        # Reset stale entries the same way fixing_wrong_old_wiki() does.
        if query[0].manual_entry == 'old':
            query.update({WikipediaDb.manual_entry: "No"},
                         synchronize_session="fetch")
            from sqlalchemy import func
            query.update({WikipediaDb.last_update_w: func.now()},
                         synchronize_session="fetch")
            session.commit()
    wiki_url = response.url
    company_name = response.meta['company_name']
    body = u'{}'.format(response.body.decode("utf-8"))
    infobox_content = self._get_infobox_content(body)
    category_content = self._get_category_content(body)
    summary_content = self._get_summary_content(body)
    logger.info("Infobox content: %s", infobox_content)
    try:
        website = infobox_content['wiki_company_website']
        sitz = infobox_content.get('sitz')
        mitarbeiter = infobox_content.get('mitarbeiter')
        branche = infobox_content.get('branche')
        revenue = infobox_content.get('revenue')
        currency = infobox_content.get('currency')
    except (KeyError, TypeError):
        website = sitz = mitarbeiter = branche = revenue = currency = ''
    yield WikipediaSpiderItem(wiki_company_website=website,
                              company_website=wiki_url,
                              summary=summary_content,
                              categories=category_content,
                              sitz=sitz,
                              mitarbeiter=mitarbeiter,
                              branche=branche,
                              revenue=revenue,
                              currency=currency,
                              company_name=company_name)
def get_companies_for_google_search(companies, force_update):
    """
    Extracts company info from the companies table, deletes duplicate
    entries, and prepares the list of companies for the google search.

    :param companies: list of companies that made requests during the specified range
    :param force_update: force-update company info in the database from the spiders
    :return: list of companies to run the google search for.
    """
    # Companies that already have a usable website.
    names = session.query(Company.name).filter(
        Company.name.in_(companies)
        & (Company.website != None)  # noqa: E711 -- SQL-level NULL check
        & (Company.website != 'NA'))
    names = {name[0].lower() for name in names}
    # Companies that exist but have no usable website.
    existing_names = session.query(Company.name).filter(
        Company.name.in_(companies)
        & ((Company.website == None) | (Company.website == 'NA')))  # noqa: E711
    existing_names = {name[0].lower() for name in existing_names}
    # A name in both sets means duplicate rows; delete the unusable copy.
    to_delete = names & existing_names
    session.query(Company).filter(
        Company.name.in_(to_delete)
        & ((Company.website == None) | (Company.website == 'NA'))).delete(  # noqa: E711
            synchronize_session='fetch')
    session.commit()
    existing_names -= to_delete
    companies = set(companies)
    if force_update:
        names.update(existing_names)
        companies.update(names)
        companies = map(lambda c: u'update_{}'.format(c), companies)
    else:
        companies = companies - names - existing_names
        companies.update(
            {u'update_{}'.format(name) for name in existing_names})
    return companies
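# Illustrative walk-through of the set arithmetic above (names invented):
# suppose companies = {'acme', 'beta', 'gamma'}, where 'acme' already has a
# website, 'beta' exists with website NULL/'NA', and 'gamma' is unknown.
#   names          == {'acme'}
#   existing_names == {'beta'}
# With force_update=False the result is {'gamma', 'update_beta'}: unknown
# names are searched fresh, stale entries are re-queued with the 'update_'
# prefix, and companies with a good website are skipped. With
# force_update=True every name is re-queued with the prefix:
# ['update_acme', 'update_beta', 'update_gamma'].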
def get_bad_revenue_wikipedia():
    # Entries with a revenue currency but no revenue value that are not
    # confirmed are considered broken and get flagged for re-parsing.
    query = session.query(WikipediaDb).filter(
        WikipediaDb.revenue_currency_wiki_w != '').filter(
            WikipediaDb.revenue_wikipedia_w == '').filter(
                WikipediaDb.manual_entry != 'confirm').filter(
                    WikipediaDb.manual_entry != 'Confirm')
    # Flag the first 170 matches; the limit is hardcoded.
    for i in range(0, 170):
        print(i)
        query_u = session.query(WikipediaDb).filter(
            WikipediaDb.company_name_w == query[i].company_name_w,
        )
        query_u.update({WikipediaDb.manual_entry: "old"},
                       synchronize_session="fetch")
        session.commit()
    count = 0
    for i in query:
        count += 1
        print(count)
    print(query)
def get_drupal_sessions(start_time, end_time):
    """
    Extracts request sessions from the accesslog table.

    :param start_time: time to extract requests from
    :param end_time: time to extract requests to
    :return: dictionary with session info keyed by company name.
    """
    logger.info("Started sessions extraction")
    access_hosts = session.query(
        Accesslog.timestamp, Accesslog.hostname, Accesslog.path,
        Accesslog.url, Accesslog.title
    ).filter(
        between(Accesslog.timestamp, func.unix_timestamp(start_time),
                func.unix_timestamp(end_time)),
        Accesslog.title != 'Generate image style',
        Accesslog.hostname.notin_(settings.IPS_BLACKLIST)).order_by(
            Accesslog.hostname, Accesslog.timestamp)
    accesslog = [Access(*res) for res in access_hosts]
    blacklist = {
        tup[0].lower()
        for tup in session.query(Company.name).filter(
            Company.type_main.in_(['Blacklist', 'Spam', 'Provider']))
    }
    ips_info = {
        tup[0]: tup[1:]
        for tup in session.query(DbIpDatabase.ip_ip, DbIpDatabase.ip_country,
                                 DbIpDatabase.ip_name, DbIpDatabase.ip_name_2,
                                 DbIpDatabase.ip_address)
    }
    res = {}
    drupal_session = DrupalSession()
    session_length = 0
    # Index of the last element in the iterated slice accesslog[:-1].
    len_accesslog = len(accesslog[:-1]) - 1
    for index, request in enumerate(accesslog[:-1]):
        host = ip_digits(request.hostname)
        access_datetime = datetime.fromtimestamp(int(request.timestamp))
        next_request = accesslog[index + 1]
        next_request_host = ip_digits(next_request.hostname)
        next_request_access_datetime = datetime.fromtimestamp(
            int(next_request.timestamp))
        difference = next_request_access_datetime - access_datetime
        is_continue = False
        # Accumulate session length: short gaps on the same host extend the
        # session by the real gap; long gaps and host changes add a default.
        if host == next_request_host and \
                difference.seconds < settings.MAXIMUM_DIFFERENCE_BETWEEN_SESSIONS.seconds:
            session_length += difference.seconds
            is_continue = True
        elif host == next_request_host:
            session_length += settings.LONG_SESSION_DEFAULT
            is_continue = True
        else:
            session_length += settings.LONG_SESSION_DEFAULT
        if index and host == ip_digits(
                accesslog[index - 1].hostname) and host != next_request_host:
            drupal_session.append(request)
        elif host == next_request_host:
            drupal_session.append(request)
            is_continue = True
        if is_continue and index != len_accesslog:
            continue
        # A session boundary was reached. Resolve the host to company info,
        # falling back to a RIPE whois lookup for hosts not seen before.
        if host in ips_info:
            country, company_name, address_result, full_address_result = \
                ips_info[host]
        else:
            country = company_name = address_result = full_address_result = ''
            try:
                country, company_name, address_result, full_address_result = \
                    get_whois(host)
            except Exception as e:
                logger.error(
                    'get_whois function (RIPE) got an error for host: {}\nError: {}'
                    .format(host, str(e)))
                continue
            finally:
                address_result = address_result[:250]
                logger.debug(address_result)
                full_address_result = full_address_result[:350]
            new_entry = DbIpDatabase(ip_ip=host,
                                     ip_country=country,
                                     ip_name=company_name,
                                     ip_name_2=address_result,
                                     ip_address=full_address_result,
                                     ip_host=host,
                                     ip_timestamp=func.now())
            session.add(new_entry)
            ips_info[host] = (country, company_name, address_result,
                              full_address_result)
        company_name = company_name.lower()
        if company_name and country in settings.RELEVANT_COUNTRIES \
                and company_name not in settings.PROVIDERS_BLACKLIST \
                and company_name not in blacklist \
                and not any(word in company_name
                            for word in settings.COMPANIES_BLACKLIST) \
                and not any(re.search(regexp, company_name)
                            for regexp in settings.PROVIDERS_BLACKLIST_REGEXPS) \
                and not any(re.search(regexp, company_name)
                            for regexp in settings.COMPANIES_BLACKLIST_REGEXPS):
            if company_name not in res:
                res[company_name] = CompanyEntry(*ips_info[host], sessions=[])
            res[company_name].sessions.append(drupal_session)
            res[company_name].session_length = timedelta(
                seconds=session_length)
        # Start a fresh session for the next host.
        drupal_session = DrupalSession()
        session_length = 0
    session.commit()
    logger.info('Sessions extraction has been finished successfully.')
    return res
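# The loop above interleaves session splitting with whois lookups and
# blacklist filtering. The core splitting rule, distilled into a standalone
# sketch (the function name and the 30-minute threshold are assumptions for
# illustration; the real threshold lives in
# settings.MAXIMUM_DIFFERENCE_BETWEEN_SESSIONS):

from datetime import timedelta as _timedelta

def split_sessions(requests, max_gap=_timedelta(minutes=30)):
    """Group (host, datetime) pairs, pre-sorted by host then time, into sessions.

    A new session starts whenever the host changes or the time gap to the
    previous request exceeds max_gap.
    """
    sessions = []
    current = []
    for req in requests:
        if current and (req[0] != current[-1][0]
                        or req[1] - current[-1][1] > max_gap):
            sessions.append(current)
            current = []
        current.append(req)
    if current:
        sessions.append(current)
    return sessions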
def update(self, days, **kwargs):
    import datetime
    force_update = True
    date_now = datetime.datetime.now()
    start_date, end_date = prepare_date_to_drupal_execute(days, **kwargs)
    drupal_companies = q.get_drupal_sessions(end_date, start_date)
    companies_names = map(lambda c: c.lower(), drupal_companies.keys())
    logger.debug('Found companies: {}'.format(companies_names))
    logger.debug('Count of found companies: {}'.format(len(companies_names)))
    companies_wiki = {}
    companies_xing = {}
    companies_google = {}
    #imported_companies = OneYearUpdate().import_companies_update()
    finish_companies_list = list(drupal_companies.keys())
    #finish_companies_list = finish_companies_list + imported_companies
    # Collect companies whose wikipedia data is older than a year.
    for company in finish_companies_list:
        try:
            query_w = session.query(WikipediaDb.last_update_w).filter(
                WikipediaDb.company_name_w == company)
            try:
                if query_w[0][0]:
                    date_diff_w = date_now - query_w[0][0]
                    if date_diff_w.days > 365:
                        companies_wiki[company] = drupal_companies[company]
            except IndexError:
                continue
        except KeyError:
            continue
    # Collect companies whose xing data is older than a year.
    for company in finish_companies_list:
        try:
            query_x = session.query(XingCompanyDb.last_update_x).filter(
                XingCompanyDb.company_name_x == company)
            try:
                if query_x[0][0]:
                    date_diff_x = date_now - query_x[0][0]
                    if date_diff_x.days > 365:
                        companies_xing[company] = drupal_companies[company]
            except IndexError:
                continue
        except KeyError:
            continue
    # Collect companies whose google evaluation is older than a year.
    for company in finish_companies_list:
        try:
            query_g = session.query(
                Company.last_update).filter(Company.name == company)
            try:
                if query_g[0][0]:
                    date_diff_g = date_now - query_g[0][0]
                    if date_diff_g.days > 365:
                        companies_google[company] = drupal_companies[company]
            except IndexError:
                continue
        except KeyError:
            continue
    companies_names_wiki = map(lambda c: c.lower(), companies_wiki.keys())
    companies_names_xing = map(lambda c: c.lower(), companies_xing.keys())
    companies_names_google = map(lambda c: c.lower(),
                                 companies_google.keys())
    logger.debug('Companies to update for wikipedia: {}'.format(
        companies_names_wiki))
    logger.debug('Count of companies to update for wikipedia: {}'.format(
        len(companies_names_wiki)))
    logger.debug('Companies to update for xing: {}'.format(
        companies_names_xing))
    logger.debug('Count of companies to update for xing: {}'.format(
        len(companies_names_xing)))
    logger.debug('Companies to update google evaluation: {}'.format(
        companies_names_google))
    logger.debug('Count of companies to update google evaluation: {}'.format(
        len(companies_names_google)))
    for name in companies_names_wiki:
        pprint(name)
        query_w_url = session.query(
            WikipediaDb.company_name_w, WikipediaDb.wiki_url_w).filter(
                WikipediaDb.company_name_w == name,
            )
        try:
            wiki_url = query_w_url[0][1]
        except IndexError:
            wiki_url = u''
        pprint(wiki_url)
        if wiki_url not in (u'', u'NA', u'N/A'):
            # A usable url exists; just flag the entry as outdated.
            query = session.query(WikipediaDb).filter(
                WikipediaDb.company_name_w == name,
            )
            query.update({WikipediaDb.manual_entry: "old"},
                         synchronize_session="fetch")
            session.commit()
        else:
            # Empty or placeholder url: refresh it from the companies table.
            query_w_u = session.query(
                Company.wikipedia_url).filter(Company.name == name)
            wiki_page = query_w_u[0][0]
            query = session.query(WikipediaDb).filter(
                WikipediaDb.company_name_w == name,
            )
            query.update({WikipediaDb.manual_entry: "old"},
                         synchronize_session="fetch")
            query.update({WikipediaDb.wiki_url_w: wiki_page},
                         synchronize_session="fetch")
            session.commit()
    for name in companies_names_xing:
        pprint(name)
        query_x_url = session.query(
            XingCompanyDb.company_name_x, XingCompanyDb.xing_url).filter(
                XingCompanyDb.company_name_x == name,
            )
        try:
            xing_url = query_x_url[0][1]
        except IndexError:
            xing_url = u''
        pprint(xing_url)
        if xing_url not in (u'', u'NA', u'N/A'):
            # A usable url exists; just flag the entry as outdated.
            query = session.query(XingCompanyDb).filter(
                XingCompanyDb.company_name_x == name,
            )
            query.update({XingCompanyDb.manual_entry: "old"},
                         synchronize_session="fetch")
            session.commit()
        else:
            # Empty or placeholder url: refresh it from the companies table.
            query_x_p = session.query(
                Company.xing_page).filter(Company.name == name)
            xing_page = query_x_p[0][0]
            query = session.query(XingCompanyDb).filter(
                XingCompanyDb.company_name_x == name,
            )
            query.update({XingCompanyDb.manual_entry: "old"},
                         synchronize_session="fetch")
            query.update({XingCompanyDb.xing_url: xing_page},
                         synchronize_session="fetch")
            session.commit()
    for name in companies_names_google:
        pprint(name)
        query_g_url = session.query(Company).filter(Company.name == name)
        query_g_url.update({Company.manual_entry: "old"},
                           synchronize_session="fetch")
        session.commit()
def close_spider(self, spider):
    session.commit()
def create_report(companies, account_data={}, account_headers=[],
                  total_fields={}, data_links={},
                  google_analytics_companies={}, dates={}):
    """
    Creates a report and saves it locally.

    :param companies: list of companies that made requests during the specified range
    """
    logger.debug(companies)
    file_name = settings.REPORTS_FILE.format(
        now=datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S"))
    path_to_xl = settings.rel('mx_crm', settings.REPORTS_FOLDER, file_name)
    logger.debug('Export excel file: {}'.format(path_to_xl))
    wb = Workbook()
    ws = wb.create_sheet('Report')
    logger.info('Saving report to the local excel file')
    # Copy so the settings list is not mutated in place across calls.
    wb_headers = list(settings.NEW_WORKBOOK_HEADERS)
    # wb_headers = settings.WORKBOOK_HEADERS
    if account_headers:
        wb_headers += account_headers
    if total_fields:
        wb_headers += settings.TOTAL_HEADERS
    wb_headers += settings.RATING_HEADERS
    ws.append(wb_headers)
    companies_info = get_companies_info(companies)
    logger.info('companies_info')
    logger.info(companies_info)
    companies_info_manual_id = get_company_table_info(companies)  # manual
    logger.info('companies_info_manual_id')
    logger.info(companies_info_manual_id)
    companies_info_websites = get_companies_info_websites(companies)
    logger.debug('Companies: {}'.format(len(companies_info)))
    companies_wiki_info = get_wiki_info(companies)
    logger.debug('Wiki companies: {}'.format(len(companies_wiki_info)))
    companies_xing_info = get_xing_info(companies)
    logger.debug('Xing companies: {}'.format(len(companies_xing_info)))
    companies_names = set()
    websites_for_rating = set()
    for c in companies_info.values():
        if c.website:
            websites_for_rating.add(c.website)
        if c.name:
            companies_names.add(c.name)
    rating_data = SquirrelRating().calc(companies=companies_names,
                                        websites=websites_for_rating)
    company_manual_account = get_manual_account(companies_names)
    variables_data = SquirrelRating().get_rating_variables(
        companies, websites_for_rating)
    try:
        counter = 0
        for company_name, company in sorted(
                companies.items(),
                key=lambda x: x[1].session_length,
                reverse=True):
            ws.row_dimensions[counter].collapsed = True
            address = company.full_address
            country = company.country
            wiki_info = companies_wiki_info.get(company_name)
            xing_info = companies_xing_info.get(company_name)
            company_info = companies_info.get(company_name)
            company_table_manual_id = companies_info_manual_id.get(
                company_name)
            website = company_info.website if company_info else ''
            full_website = re.sub(r'www\d?\.', '', website).rstrip('/').lower()
            prepared_company_name = company_name
            xing_page = company_info.xing_page if company_info else None
            session_length = company.session_length
            total_session_lenght = ''  # default when no totals exist
            for session in company.sessions:
                for request in session.requests:
                    # Persist each page view to the access history table.
                    access_history = MxCrmAccessHistory(
                        company_name=company.company_name,
                        a_h_sid=counter,
                        mx_crm_visited_page=request.title,
                        mx_crm_referrer=request.url[:255],
                        mx_crm_session_date=datetime.datetime.fromtimestamp(
                            int(request.timestamp)).strftime('%Y-%m-%d'),
                        mx_crm_session_time=datetime.datetime.fromtimestamp(
                            int(request.timestamp)).strftime('%H:%M:%S'),
                        mx_crm_ip_vlan=request.hostname)
                    alchemy_session.add(access_history)
                    alchemy_session.commit()
                    sheet_counter = 2
                    company_table_info = get_manual_website(
                        company.company_name)
                    access_dt = datetime.datetime.fromtimestamp(
                        request.timestamp).strftime('%Y-%m-%d %H:%M:%S')
                    rcd_name_rating = companies_info.get(company_name)
                    if rcd_name_rating and rcd_name_rating.name:
                        rating = rating_data.get(rcd_name_rating.name, 'N/C') \
                            if rating_data.get(rcd_name_rating.name) is not None \
                            else 'N/C'
                    if company_name in total_fields:
                        obj = total_fields.get(company_name, {})
                        total_session_lenght = datetime.timedelta(
                            seconds=obj.get('time') or 0)
                    sheet_number = 'A{}'.format(sheet_counter)
                    # ws[sheet_number].hyperlink = "http://google.com"
                    # ws[sheet_number].value = company.company_name
                    link = data_links.get(company.company_name.lower())
                    c_id = alchemy_session.query(Company.id).filter(
                        Company.name == company.company_name)
                    try:
                        company_id = c_id[0][0]
                        webinterface_link = (
                            "http://192.168.0.141:8000/squirrel/accounts/{}/"
                            .format(company_id))
                    except IndexError:
                        company_id = ''
                        webinterface_link = (
                            "http://192.168.0.141:8000/squirrel/accounts/search/{}/"
                            .format(company.company_name))
                    query_link = alchemy_session.query(Company).filter(
                        Company.name == company.company_name)
                    query_link.update({Company.d_crm_link: link},
                                      synchronize_session="fetch")
                    alchemy_session.commit()
                    row = ['=HYPERLINK("{}", "{}")'.format(
                               webinterface_link, company.company_name),
                           company_table_info.get(company.company_name),
                           website, session_length, total_session_lenght,
                           rating_data.get(company.company_name), address,
                           request.title, request.url, access_dt, country]
                    sheet_counter += 1
                    if wiki_info:
                        row.extend([
                            wiki_info.manual_entry,
                            wiki_info.wiki_url_w,
                            convert_to_float(wiki_info.revenue_wikipedia_w),
                            wiki_info.revenue_currency_wiki_w,
                            convert_to_int(wiki_info.employees_wikipedia_w),
                            wiki_info.categories_wikipedia_w,
                            wiki_info.branch_wikipedia_w,
                            wiki_info.summary_wikipedia_w,
                        ])
                    else:
                        row.extend([''] * 8)
                    if xing_info:
                        # Fall back to 'NONE' when no manual account id is set.
                        c_t_manual_id = (
                            company_table_manual_id.manual_account_id
                            or u'NONE')
                        row.extend([
                            xing_info.manual_entry,
                            xing_page,
                            xing_info.country_xing,
                            xing_info.employees_group_xing_x,
                            xing_info.employees_size_xing,
                            xing_info.description_xing,
                            xing_info.industry_xing,
                            c_t_manual_id
                            # company_manual_account.get(company_name)
                        ])
                    else:
                        row.extend([''] * 8)
                    if full_website in account_data or \
                            prepared_company_name in account_data:
                        data_to_extend = []
                        for key in account_headers:
                            if full_website in account_data:
                                value = account_data[full_website].get(key, '')
                            else:
                                value = account_data[
                                    prepared_company_name].get(key, '')
                            data_to_extend.append(value)
                        row.extend(data_to_extend)
                    elif account_headers:
                        row.extend([''] * len(account_headers))
                    if company_name in total_fields:
                        obj = total_fields.get(company_name, {})
                        row.extend([
                            datetime.timedelta(seconds=obj.get('time') or 0),
                            convert_to_int(obj.get('visited')),
                            obj.get('last_visited'),
                        ])
                    else:
                        row.extend([''] * len(settings.TOTAL_HEADERS))
                    rcd_name = companies_info.get(company_name)
                    if rcd_name and rcd_name.name:
                        if wiki_info:
                            row.extend([wiki_info.manual_entry])
                        else:
                            row.extend([""])
                        if xing_info:
                            row.extend([xing_info.manual_entry])
                        else:
                            row.extend([""])
                        query = alchemy_session.query(Company).filter(
                            Company.name == rcd_name.name)
                        rating_update_info = dict(
                            mx_crm_location_level=variables_data.get(
                                rcd_name.name).get('location'),
                            mx_crm_branch_level=variables_data.get(
                                rcd_name.name).get('branch'),
                            mx_crm_google_evaluation=float(
                                variables_data.get(rcd_name.name).get(
                                    'google_ev')),
                            mx_crm_wiki_rating_points=variables_data.get(
                                rcd_name.name).get('wiki_size'),
                            mx_crm_xing_rating_points=variables_data.get(
                                rcd_name.name).get('xing_size'),
                            mx_crm_revenue_level=variables_data.get(
                                rcd_name.name).get('revenue_point'))
                        query.update(rating_update_info,
                                     synchronize_session=False)
                        # The indices below assume the fixed column layout of
                        # NEW_WORKBOOK_HEADERS plus the account headers.
                        relation_ship_type = row[36]
                        account_name = row[27]
                        account_owner = row[28]
                        abc_rating = row[38]
                        closed_activity_type = row[31]
                        if row[32] != '':
                            closed_date = row[32]
                        else:
                            closed_date = None
                        # closed_date = datetime.datetime.strptime(str(row[32]), '%m/%d/%Y %H:%M:%S')
                        open_activity_type = row[33]
                        if row[34] != '':
                            schedule_date = row[34]
                        else:
                            schedule_date = None
                        # schedule_date = datetime.datetime.strptime(str(row[34]), '%m/%d/%Y %H:%M:%S')
                        total_session_length = row[39]
                        total_visited_page = row[40]
                        last_visit_time = row[41]
                        alchemy_session.commit()
                        # Mirror the Dynamics CRM columns back into the
                        # companies table (the webinterface link could also
                        # be saved in this query).
                        dynamics_crm_info = dict(
                            d_crm_relationship_type=relation_ship_type,
                            d_crm_account_name=account_name,
                            d_crm_account_owner=account_owner,
                            d_crm_abc_rating=abc_rating,
                            d_crm_closed_activity_type=closed_activity_type,
                            d_crm_open_activity_type=open_activity_type,
                            d_crm_closed_date=closed_date,
                            d_crm_schedule_date=schedule_date,
                            mx_crm_total_session_length=total_session_length,
                            mx_crm_total_visited_pages=total_visited_page,
                            mx_crm_last_visit=last_visit_time,
                            squirrel_rating=rating_data.get(rcd_name.name))
                        query_dynamics_crm = alchemy_session.query(
                            Company).filter(Company.name == rcd_name.name)
                        query_dynamics_crm.update(dynamics_crm_info,
                                                  synchronize_session=False)
                        alchemy_session.commit()
                        row.extend([
                            rating_data.get(rcd_name.name, 'N/C')
                            if rating_data.get(rcd_name.name) is not None
                            else 'N/C',
                        ])
                        row.extend([
                            variables_data.get(rcd_name.name).get('location')])
                        row.extend([
                            variables_data.get(rcd_name.name).get('branch')])
                        row.extend([
                            variables_data.get(rcd_name.name).get('google_ev')])
                        row.extend([
                            variables_data.get(rcd_name.name).get('wiki_size')])
                        row.extend([
                            variables_data.get(rcd_name.name).get('xing_size')])
                        row.extend([
                            variables_data.get(rcd_name.name).get(
                                'revenue_point')])
                    else:
                        row.extend(['N/C'] * len(settings.RATING_HEADERS))
                    try:
                        ws.append(row)
                    except ValueError as e:
                        logger.info(e)
                    counter += 1
                    # Collapse follow-up rows of the same company group.
                    if not ws.row_dimensions[counter - 1].collapsed:
                        ws.row_dimensions[counter].hidden = True
                        ws.row_dimensions[counter].outlineLevel = 1
        wb.save(path_to_xl)
        d_start = dates.get('start_date')
        e_date = dates.get('end_date')
        start_date = datetime.datetime(d_start.year, d_start.month,
                                       d_start.day)
        end_date = datetime.datetime(e_date.year, e_date.month, e_date.day)
        # g_a_c = get_google_analytics_sessions(start_date, end_date, True)
        # logger.info(g_a_c)
        # logger.info(google_analytics_companies)
        # result = add_google_analytics_accounts_to_report_file(path_to_xl, start_date, end_date)
        # Working directory for the (currently disabled) analytics step.
        os.chdir("C:/Users/admin/PycharmProjects/SquirrelRunnerNew/mx_crm")
        # cd = os.system('python add_companies.py --days_start={0} --year_start={1} --month_start={2} --days_end={3} --year_end={4} --month_end={5}'.format(
        #     d_start.day, d_start.year, d_start.month, e_date.day, e_date.year, e_date.month
        # ))
        # logger.info(cd)
    except KeyError as e:
        logger.error(e)
    logger.info('Local file has been updated')
def log_end(self, status, error):
    self.current_session.error = error
    self.current_session.status = status
    self.current_session.end_datetime = datetime.now()
    session.commit()