示例#1
0
	def update_cb_basic_company(self):
		df = db.pandas_read(sql.sql_cb_basic_company.value)
		for _, r in df.iterrows():
			basic_name = CM.get_basic_name(r['name'])
			sql_update = sql.sql_cb_basic_company_update.value.format(basic_name, CM.sql_compliant(r['org_uuid']))
			print(sql_update)
			db.execute(sql_update)
示例#2
0
    def schema_to_dfs(self, schema, which_tables="all"):
        """
        Takes name of schema and returns dict of dataframes, one entry for each table in schema.
        Key = table name
        Value = dataframe
        Dataframes will contain results from SELECT * FROM TABLE.
        :param schema:
        :return dict:
        """
        select_all = "SELECT * FROM "
        sql = str(
            "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '" +
            str(schema) + "'")
        tables = DAL.pandas_read(sql)

        if which_tables == "all":
            table_names = tables["TABLE_NAME"]
        else:
            table_names = which_tables

        table_dict = {}
        sql_statements = []
        for table in table_names:
            statement = str(select_all + schema + "." + table)
            sql_statements.append(statement)
            table_dict[table] = 0

        for statement in sql_statements:
            df = DAL.pandas_read(statement)
            table_name = statement.split(".")[1]
            for key in table_dict.keys():
                if key.lower() == table_name.lower():
                    table_dict[key] = df

        return table_dict
示例#3
0
	def push_entity_to_db(self, json, org_uuid, sql_insert, uuid, i=0, fk_uuid='org_uuid', columns=[]):
		try:
			json_properties = None
			if CBDict.properties.value in json.keys():
				json_properties = json[CBDict.properties.value]
			elif json[CBDict.cardinality.value] == 'OneToOne':
				json_properties = json[CBDict.item.value][CBDict.properties.value]
			elif json[CBDict.cardinality.value] == 'OneToMany':
				json_properties = json[CBDict.items.value][i][CBDict.properties.value]
			if 'uuid' not in json_properties.keys():
				json_properties['uuid'] = uuid
			if fk_uuid not in json_properties.keys():
				json_properties[fk_uuid] = org_uuid
			# print(list(json_properties.keys()))
			df_properties = pd.DataFrame([json_properties], columns=json_properties.keys())
			if len(columns) > 0:
				df_properties = df_properties[columns]
			values = CM.df_list(df_properties)
			val = []
			for l, j in enumerate(values[0]):
				if isinstance(values[0][l], list):
					val.append(' , '.join(str(x) for x in values[0][l]))
				elif isinstance(values[0][l], str):
					val.append(self.common.sql_compliant(values[0][l]))
				else:
					val.append(values[0][l])
			db.bulk_insert(sql_insert, [val])
		except Exception as ex:
			print(ex)
示例#4
0
	def update_tdw_basic_company(self):
		df = db.pandas_read(sql.sql_tdw_basic_company.value)
		for _, r in df.iterrows():
			basic_name = CM.get_basic_name(r.legal_name)
			sql_update = sql.sql_tdw_basic_company_update.value.format(basic_name, CM.sql_compliant(r.legal_name))
			print(sql_update)
			db.execute(sql_update)
示例#5
0
    def __init__(self):
        self.db = DB()

        self.training_file = 'company_traning.json'
        self.setting_file = 'company_settings'
        self.result_set = None
        self.start_time = time.time()
示例#6
0
 def __init__(self):
     self.source_table = 'MDC_DEV.dbo.SourceTable'
     self.valid = validate()
     self.data = db.pandas_read(
         "SELECT * FROM MDC_DEV.dbo.ProcessedVenture").to_dict('index')
     self.source = db.pandas_read(
         'SELECT * FROM MDC_DEV.dbo.SourceTable WHERE ID < 0').to_dict(
             'index')
示例#7
0
	def transfer_fact_ric_aggregation():
		date_id = COM.get_dateid(datevalue=None)
		metric_prg = [130, 132, 133, 129, 134, 63, 77, 60, 68, 67, 135, 136, 137]
		metric_prg_youth = [134, 138]
		
		df_program = db.pandas_read(sql.sql_company_aggregate_program.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter))
		df_program_youth = db.pandas_read(sql.sql_company_aggregate_program_youth.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter))
		
		values = []
		
		for _, row in df_program.iterrows():
			i = 7
			while i < 20:
				m = i - 7
				val = []
				val.append(int(row['DataSource']))  # DataSource
				val.append(int(date_id))  # RICDateID
				val.append(int(metric_prg[m]))  # MetricID
				val.append(int(row['BatchID']))  # BatchID
				
				if str(row[i]) in ['no data', 'n\\a', '-', 'n/a', 'nan']:
					val.append(-1.0)
					print(row[i])
				else:
					val.append(round(float(row[i]), 2))  # AggregateNumber
				val.append(str(datetime.datetime.today())[:23])  # ModifiedDate
				val.append(str(datetime.datetime.today())[:23])  # CreatedDate
				val.append(row['Youth'])  # Youth
				values.append(val)
				i = i + 1
				# db.execute(sql.sql_bap_fra_insert.value.format(tuple(val)))
		
		for _, row in df_program_youth.iterrows():
			
			j = 7
			while j < 9:
				m = j - 7
				val = []
				val.append(int(row['DataSource']))  # DataSource
				val.append(int(date_id))  # RICDateID
				val.append(int(metric_prg_youth[m]))  # MetricID
				val.append(int(row['BatchID']))  # BatchID
				if str(row[j]) in ['no data', 'n\\a', '-', 'n/a', 'nan']:
					val.append(-1.0)
					print(row[j])
				else:
					val.append(round(float(row[j]), 2))  # AggregateNumber
				val.append(str(datetime.datetime.today())[:23])  # ModifiedDate
				val.append(str(datetime.datetime.today())[:23])  # CreatedDate
				val.append(row['Youth'])  # Youth
				
				values.append(val)
				j = j + 1
				# db.execute(sql.sql_bap_fra_insert.value.format(tuple(val)))
		for val in range(len(values)):
			print('{}. {}'.format(val,values[val]))
			# print('{}. {}'.format(val,values[val][1]))
		db.bulk_insert(sql.sql_bap_fact_ric_aggregation_insert.value, values)
示例#8
0
	def push_bap_missing_data_to_temp_table():
		 current_path = os.path.join(os.path.expanduser("~"), '/Users/mnadew/Box Sync/Workbench/BAP/BAP_FY18/FY18_Q3/for ETL/Missing data Reports')
		 os.chdir(current_path)
		 df = pd.read_excel('00 BAP Missing data Combined.xlsx', 'BAP Missing data')
		 df['CompanyID'] = 0
		 new_col = ['CompanyID','CompanyName','BasicName','Website','AnnualRevenue','NumberOfEmployees','FundingToDate','DataSource']
		 dfs = df[new_col]
		 sql = 'INSERT INTO BAP.BAP_FY18Q3_Missing_Data VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
		 values = COM.df_list(dfs)
		 db.bulk_insert(sql, values)
示例#9
0
 def delete_old_ans():
     # delete old ans using answer ids
     #   store old ans in xl file
     old_ans_sql = CM.get_config('config.ini', 'secondary_etl', 'old_ans')
     old_ans_df = DB.pandas_read(old_ans_sql)
     DBInteractions.store_df(old_ans_df, '_OLD_PIPE_ANS')
     #   run sql to delete old ans
     del_old_ans_sql = CM.get_config('config.ini', 'secondary_etl',
                                     'del_old_ans')
     DB.execute(del_old_ans_sql)
示例#10
0
 def update(self, table_name, source_id_col, company_id_col):
     etl = common.df_list(
         db.pandas_read('SELECT ' + source_id_col + ',' + company_id_col +
                        ' FROM ' + table_name))
     for index1, val1 in enumerate(self.source_table):
         for index2, val2 in enumerate(etl):
             if val1[0] == str(val2[0]):
                 db.execute('UPDATE ' + table_name + ' SET ' +
                            company_id_col + ' = ' + str(val1[1]) +
                            ' WHERE ' + source_id_col + ' = ' +
                            str(val2[0]))
                 break
示例#11
0
	def create_bap_batch():
		batch = BatchService()
		program = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program.value,BapQuarterly.year,BapQuarterly.quarter))
		program_youth = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program_youth.value, BapQuarterly.year,BapQuarterly.quarter))
		company = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.venture_data.value, BapQuarterly.year, BapQuarterly.quarter))
		comapny_annual = db.pandas_read(sql.sql_annual_bap_distinct_batch.value.format(tbl.venture_annual.value,BapQuarterly.year))

		# batch.create_bap_batch(program, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program.value, WS.bap_program.value, ss.RICPD_bap.value)
		# batch.create_bap_batch(program_youth, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program_youth.value, WS.bap_program_youth.value, ss.RICPDY_bap.value)
		batch.create_bap_batch(company, BapQuarterly.year, BapQuarterly.quarter, tbl.venture_data.value, WS.bap_company.value, ss.RICCD_bap.value)
		if BapQuarterly.quarter == 3:
			batch.create_bap_batch(comapny_annual, BapQuarterly.year, BapQuarterly.quarter, tbl.venture_annual.value, WS.bap_company_annual.value, ss.RICACD_bap.value)
示例#12
0
	def transfer_fact_ric_company_data():
		df = db.pandas_read(sql.sql_bap_fact_ric_data_fyq4.value)
		df_frc = BapQuarterly.get_proper_values(df)
		# BapQuarterly.update_month_year(df_frc)
		# df_frc['IntakeDate'] = pd.to_datetime(df_frc['IntakeDate'])
		df_frc['Age'] = None
		# df_frc['Date of Incorporation'] = pd.to_datetime(df_frc['Date of Incorporation'])
		# df_ric = df_frc.drop(columns=['ID', 'Incorporate year (YYYY)', 'Incorporation month (MM)'])
		# BapQuarterly.file.save_as_csv(df_frc, '00 FactRICCompany.xlsx', os.getcwd(), 'FactRICCompany')
		values_list = COM.df_list(df_frc)

		db.bulk_insert(sql.sql_bap_fact_ric_company_insert.value, values_list)
示例#13
0
    def nomatch_create_new(self):
        """Add non-duplicate ventures that are new companies (-ve ID) as new ventures to the venture table """
        new_ventures = common.df_list(
            db.pandas_read(
                "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID NOT IN "
                "(SELECT ID FROM MDC_DEV.dbo.EntityMap) AND a.ID < 0 "))
        sql = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
        db.bulk_insert(sql, new_ventures)

        # Update ID to match Venture Table in the given source table
        if self.source_table is not None:
            sql = 'UPDATE ' + self.source_table + ' SET ID = b.ID FROM ' + self.source_table + ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name'
            db.execute(sql)
示例#14
0
	def move_annual_company_data(self):
		i, j = 0, 0
		dfac = db.pandas_read('SELECT ID, BatchID, CompanyID,[Company Name] FROM BAP.AnnualCompanyData')
		dfdc = db.pandas_read('SELECT CompanyID, CompanyName FROM Reporting.DimCompany')
		dfac['BasicName'] = dfac.apply(lambda dfs: CM.get_basic_name(dfs['Company Name']), axis=1)
		dfdc['BasicName'] = dfdc.apply(lambda dfs: CM.get_basic_name(dfs.CompanyName), axis=1)
		for i, c in dfac.iterrows():
			dfc = dfdc[dfdc['BasicName'] == c.BasicName]
			val = dict()
			if len(dfc) > 0:
				i+=1
				db.execute(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID))
				print(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID))
			else:
				j+=1
				print(sql.sql_dim_company_insert.value)
				new_com_id = self.batch.get_table_seed('MaRSDataCatalyst.Reporting.DimCompany', 'CompanyID') + 1
				val['CompanyID'] = new_com_id
				val['Company Name'] = c['Company Name']
				val['Description'] = None
				val['Phone'] = None
				val['Phone2'] = None
				val['Fax'] = None
				val['Email'] = None
				val['Website'] = None
				val['CompanyType'] = None
				val['BatchID'] = c.BatchID
				val['ModifiedDate'] = str(dt.datetime.utcnow())[:-3]
				val['CreatedDate'] = str(dt.datetime.utcnow())[:-3]
				df = pd.DataFrame([val], columns=val.keys())
				values = CM.df_list(df)
				db.bulk_insert(sql.sql_dim_company_insert.value, values)
				db.execute(sql.sql_annual_comapny_data_update.value.format(new_com_id, c.ID))
		print('{} exists and {} doesn not exist'.format(i, j))
示例#15
0
 def __init__(self):
     self.MDCReport = common.df_list(
         db.pandas_read(
             'SELECT RICCompanyDataID, CompanyID,DataSource,BatchID,DateID,AdvisoryServicesHours,'
             'VolunteerMentorHours, AnnualRevenue, NumberEmployees,FundingToDate, FundingCurrentQuarter, '
             'HighPotential,SocialEnterprise '
             'FROM MDCReport.BAPQ.FactRICCompany'))
     self.MaRSDataCatalyst = common.df_list(
         db.pandas_read(
             'SELECT RICCompanyDataID, CompanyID,DataSourceID,BatchID,DateID,AdvisoryServicesHours,'
             'VolunteerMentorHours, AnnualRevenue, NumberEmployees,FundingToDate, FundingCurrentQuarter, '
             'HighPotential,SocialEnterprise FROM MaRSDataCatalyst.Reporting.FactRICCompanyData'
         ))
     self.records = []
示例#16
0
def _main_():
    print("Getting SQL query")
    sql = CM.get_config("config_sql.ini", "ann_survey_18", "caprevjob_by_ric")
    print("SQL: {}".format(sql))
    print("Executing SQL to get dataframe of results")
    all_results = DB.pandas_read(sql)

    print("Creating column names")
    all_results['ConcatQ'] = all_results[['Cap/Rev/Emp', 'Question']].apply(lambda x: ' - '.join(x), axis=1)
    print("Splitting dataframe into one per RIC")
    split_frames = partition_by(all_results, "RIC_Program")
    print("Getting write path")
    user_path = os.path.expanduser("~")
    path = user_path + "/Box Sync/Workbench/BAP/Annual Survey FY2018/Results by RIC/"
    print("Path: {}".format(path))

    print("Writing files to disc:")
    for ric in split_frames.keys():
        x = split_frames[ric]
        x['rid_cid'] = x['resp_id'].astype(str) + '_' + x['Company_ID'].astype(str)
        x = spread(x, 'rid_cid', 'ConcatQ', 'Answer')
        x['rid_cid'] = x.index
        x['_resp_id'], x['_Company_ID'] = x['rid_cid'].str.split('_', 1).str
        x = x.apply(pd.to_numeric, errors='ignore')
        cols = x.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        x = x[cols]
        for i in range(len(cols)):
            if str(cols[i])[0] == '_':
                cols[i] = cols[i][1:]
        x.columns = cols
        x = x.drop('rid_cid', axis=1)
        filename = "{} Survey Results".format(ric)
        write_to_xl(x, filename, path, 'Results')
        print("Wrote {} to path: {}".format(filename, path))
示例#17
0
	def get_table_seed(self, table, id_column):
		seed = 0
		sql_dc = sql.sql_get_max_id.value.format(id_column, table)
		df = db.pandas_read(sql_dc)
		if len(df) > 0:
			seed = df.values[0][0]
		return seed
示例#18
0
	def split_venture_former_name(self):
		df = db.pandas_read('SELECT ID, CompanyName, [Former / Alternate Names] FROM MDCRaw.BAP.VentureQuarterlyData WHERE CompanyName LIKE \'%(%\' AND FiscalYear = 2019')
		for _, row in df.iterrows():
			split = CM.venture_name_with_bracket_split(row['CompanyName'])
			# print('Current: {}\nName: {}\nAlternate: {}'.format(row['CompanyName'],split[0], split[1].replace('(','').replace(')','')))
			# print('-' * 100)
			update = '''UPDATE MDCRaw.BAP.VentureQuarterlyData SET CompanyName = \'{}\' , [Former / Alternate Names] = \'{}\' WHERE ID = {} -- {}'''
			print(update.format(split[0], split[1].replace('(','').replace(')','').replace('formerly',''),row['ID'],row['CompanyName']))
示例#19
0
	def save_orgs_relationship(self, api_url):
		url = api_url + self.api_org_token
		print(url)
		orgs = CM.get_api_data(url)
		if orgs.ok:

			self.org_uuid = orgs.json()[CBDict.data.value][CBDict.uuid.value]
			df = self.db.pandas_read(self.enum.SQL.sql_org_detail_exists.value.format(self.org_uuid))
			if len(df) == 0:#self.db.entity_exists('MDCRaw.CRUNCHBASE.Organization', 'org_uuid', self.org_uuid):
				self.save_organization_detail(self.org_uuid, orgs.json()[CBDict.data.value][CBDict.properties.value])
				rs_json = orgs.json()[CBDict.data.value][CBDict.relationships.value]
				self.save_funding_rounds(rs_json['funding_rounds'], self.org_uuid)
				self.save_relational_entity(rs_json[self.enum.CBDict.headquarters.value], self.org_uuid,
											self.enum.SQL.sql_offices_exists.value, self.enum.SQL.sql_offices_insert.value,
											self.office_col)
				self.save_relational_entity(rs_json['categories'], self.org_uuid,
											self.enum.SQL.sql_org_category_exists.value,
											self.enum.SQL.sql_org_category_insert.value, self.category_col)

			# save all the related entities
			# self.save_teams(rs_json['featured_team'], self.org_uuid, TeamStatus.Featured.value)
			# self.save_teams(rs_json['current_team'], self.org_uuid, TeamStatus.Current.value)
			# self.save_teams(rs_json['past_team'], self.org_uuid, TeamStatus.Past.value)
			# self.save_teams(rs_json['board_members_and_advisors'], self.org_uuid, TeamStatus.Board.value)


			# self.save_investments_invested_in(rs_json['investments'])

			# self.save_relational_entity(rs_json['sub_organizations'], self.org_uuid, self.sql_sub_organization_insert)

			# if rs_json[self.enum.CBDict.offices.value][self.enum.CBDict.items] is not None:
			# 	self.save_relational_entity(rs_json[self.enum.CBDict.offices.value], self.org_uuid, self.enum.SQL.sql_offices_exists, self.sql_offices_insert, self.office_col)

			# self.save_relational_entity(rs_json['founders'], self.org_uuid, self.sql_founders_insert)
			# self.save_relational_entity(rs_json['acquisitions'], self.org_uuid, self.sql_acquisition_insert)
			# self.save_relational_entity(rs_json['acquired_by'], self.org_uuid, self.sql_acquired_insert)
			# self.save_relational_entity(rs_json['ipo'], self.org_uuid, self.sql_ipo_insert)
			# self.save_relational_entity(rs_json['funds'], self.org_uuid, self.sql_funds_insert)
			# self.save_relational_entity(rs_json['websites'], self.org_uuid, self.sql_websites_insert)
			# self.save_relational_entity(rs_json['images'], self.org_uuid, self.sql_image_insert)
			# self.save_relational_entity(rs_json['news'], self.org_uuid, self.sql_news_insert)

				db.execute(self.orgs_summary_update.format(self.org_uuid))
			else:
				print('organization already exists.')
示例#20
0
    def df_to_db(self,
                 df,
                 sql_config_header,
                 remove_single_quotes=True,
                 return_vals=False,
                 clean_numeric_cols=False):

        df_headers, df_qmarks, df_vals = self.get_sql_params(
            df, remove_single_quotes=remove_single_quotes)
        df_header_str = self.get_header_str(df_headers)
        df_sql = CM.get_config("config.ini", "sql_queries", sql_config_header)
        df_sql = df_sql.replace("WHAT_HEADERS",
                                df_header_str).replace("WHAT_VALUES",
                                                       df_qmarks)

        if clean_numeric_cols:
            for lst in df_vals:
                for i in range(len(lst)):
                    element = lst[i]
                    try:
                        if str(element).lower() == "nan" or str(
                                element) == "0000-00-00 00:00:00" or str(
                                    element) == '':
                            new_val = None
                            if type(lst) == tuple:
                                lst = self.replace_tuple_val_at_index(
                                    lst, i, new_val)
                        if np.dtype(element) == 'int64':
                            new_val = int(str(lst[i]))
                            if type(lst) == tuple:
                                lst = self.replace_tuple_val_at_index(
                                    lst, i, new_val)
                    except AttributeError:
                        continue
                    except TypeError:
                        continue
                    except ValueError:
                        continue
                if len(df_vals) == 1 and type(df_vals[0]) == tuple:
                    df_vals[0] = lst

        DB.bulk_insert(df_sql, df_vals)

        if return_vals:
            return df_vals
示例#21
0
	def update_raw_company(self):
		dfnew, dfdc, _ = self.get_company()
		raw_company = self.generate_basic_name(dfnew)
		dim_company = self.generate_basic_name(dfdc) if len(dfdc) > 0 else None

		for index, com in raw_company.iterrows():
			try:
				company_name = com['BasicName']
				print(company_name)
				if len(dim_company[dim_company.BasicName == company_name].CompanyID.values) > 0:
					companyid = dim_company[dim_company.BasicName == company_name].CompanyID.values[0]
					if companyid > 0:
						sql_update = sql.sql_update.value.format('BAP.QuarterlyCompanyData', 'CompanyID', companyid, 'ID', com.ID)
						print(sql_update)
						db.execute(sql_update)
				else:
					print('{} >> {}'.format(com.ID, com.CompanyName))
			except Exception as ex:
				print('UPDATE ISSUE: {}'.format(ex))
示例#22
0
	def insert_dim_company_source(self, new_company):
		try:
			date_time = str(dt.datetime.utcnow())[:-3]
			self.dim_company_source_id = self.get_table_seed('Reporting.DimCompanySource', 'SourceCompanyID') + 1
			dc = dict()
			dc['aSourceID'] = self.dim_company_source_id
			dc['bCompanyID'] = self.dim_company_id
			dc['cName'] = new_company['Name']
			dc['dSCC'] = None
			dc['eDataSource'] = new_company['DataSource']
			dc['eBatchID'] = new_company['BatchID']
			dc['fCT'] = None
			dc['gModified'] = date_time
			dc['hCreated'] = date_time
			df = pd.DataFrame.from_dict([dc], orient='columns')
			values = CM.df_list(df)
			db.bulk_insert(sql.sql_dim_company_source_insert.value, values)
		except Exception as ex:
			print(ex)
示例#23
0
    def check_qs_exist(self, survey_id):

        sql = CM.get_config("config.ini", "sql_queries",
                            "check_questions_exist")
        sql = sql.replace("WHAT_SURVEY_ID", str(survey_id))
        check = DB.pandas_read(sql)

        if check.iloc[0][0]:
            return True
        else:
            return False
示例#24
0
    def preprocess():
        # # CLEANING
        # Website Cleaning
        # Where all the fields are 'n/a' 'NA' etc need to change them to null
        db.execute(
            "UPDATE MDC_DEV.dbo.ProcessedVenture "
            "SET Website = NULL WHERE Website ='na' OR Website = 'n/a' OR Website = '-' "
            "OR Website = 'http://' OR Website = 'no website' OR Website = '--' "
            "OR Website = 'http://N/A' OR Website = 'no data' OR Website = 'http://www.facebook.com' "
            "OR Website = '0' OR Website = 'http://www.nowebsite.co'"
            "OR Website = 'http://N/A - Development stage.' OR Website = ' -'"
            "OR Website = 'http://NOT ON WEB' OR Website = 'http://none'"
            "OR Website = 'http://coming soon' OR Website = 'http://not.yet' "
            "OR Website = 'http://no website' OR Website = 'http://not yet' "
            "OR Website = 'none' OR Website = 'http://NA'OR Website = 'tbd' "
            "OR Website = 'https' OR Website = 'http://www.nowebsite.com' "
            "OR Website = 'http://nowebsite.com' OR Website = 'http://Nowebsiteyet' "
            "OR Website = 'Coming soon' OR Website = 'not set up yet' OR Website = 'http://www.yahoo.com' "
            "OR Website = 'http://under construction' OR Website = 'http://www.nwebsite.com' "
            "OR Website = 'http://www.google.com' OR Website = 'http://www.google.ca' OR Website = 'youtube.com'"
        )

        # # Phone Cleaning
        db.execute(
            "UPDATE MDC_DEV.dbo.ProcessedVenture SET Phone = NULL WHERE LEN(Phone)<10 "
        )

        # # Name Cleaning
        db.execute(
            "DELETE FROM MDC_DEV.dbo.ProcessedVenture WHERE Name LIKE '%communitech%' OR Name LIKE '%company:%' "
            "OR Name LIKE '%nwoic%' OR Name LIKE '%riccentre%' OR Name LIKE '%sparkcentre%'OR Name LIKE '%venture%' "
            "OR Name LIKE '%Wetch_%' OR Name LIKE '%testBAP%' OR Name LIKE '%SSMIC%' OR Name LIKE '%Techalliance%' "
            "OR Name LIKE '%RICC_%' OR Name LIKE '%_anon_%' OR Name LIKE '%InnovationFactory_%' "
            "OR Name LIKE '%InvestOttawa_%' OR Name LIKE '%Québec inc%' OR Name LIKE '%Haltech_%' "
            "OR Name LIKE '%InnovNiag_survey%' OR Name LIKE '%NOIC%'")
示例#25
0
    def write_results(clustered_dupes_list):
        """
        Load finalized clusters into EntityMap table
        :param clustered_dupes_list: list of tuples returned from clustering
        :return: number of duplicate sets
        """
        # We now have a sequence of tuples of company IDs that dedupe believes
        # all refer to the same entity. We write this out onto an entity map table
        db.execute("DELETE FROM MDC_DEV.dbo.EntityMap")

        print('Populating EntityMap table.')

        sql = 'INSERT INTO MDC_DEV.dbo.EntityMap (ID, CanonID, ClusterScore) VALUES (?,?,?)'
        values = []
        cluster_id = 1
        for cluster, scores in clustered_dupes_list:
            for id, score in zip(cluster, scores):
                values.append([int(id), int(cluster_id), float(score)])
            cluster_id = cluster_id + 1

        db.bulk_insert(sql, values)

        db.execute("UPDATE MDC_DEV.dbo.EntityMap SET Name =  p.Name, BatchID = p.BatchID, "
                   "Description = p.Description, Website = p.Website, Email = p.Email, "
                   "Phone = p.Phone, Address = p.Address "
                   "FROM MDC_DEV.dbo.EntityMap AS a INNER JOIN MDC_DEV.dbo.ProcessedVenture AS p "
                   "ON a.ID = p.ID")

        return len(clustered_dupes_list)
示例#26
0
    def remove_false_positives():
        """Consult MatchingFalsePositives and remove known false positive clusters from EntityMap"""

        db.execute("SELECT * FROM MDC_DEV.dbo.EntityMap ORDER BY CanonID")
        entity_map = db.pandas_read("SELECT * FROM MDC_DEV.dbo.EntityMap").set_index('ID').to_dict('index')
        clusters = []
        for index1, val1 in entity_map.items():
            for index2, val2 in entity_map.items():
                if val1['CanonID'] == val2['CanonID'] and index1 != index2:
                    clusters.append([index1, index2])
                    break

        fal_pos = db.pandas_read("SELECT * FROM MDC_DEV.dbo.MatchingFalsePositives").to_dict('index')
        remove = []
        for cluster in clusters:
            for i, v in fal_pos.items():
                if cluster[0] == v['ID'] and cluster[1] == v['FalseID']:
                    remove.append([cluster[0]])
                    remove.append([cluster[1]])
                    break
                else:
                    continue

        sql = 'DELETE FROM MDC_DEV.dbo.EntityMap WHERE ID = (?)'
        db.bulk_insert(sql, remove)
示例#27
0
    def block(self, selection):
        """
        :param deduper: deduper object created in training
        :param selection: sql statement selecting all relevant columns to use in deduplication
        :return: None
        """
        # If dedupe learned a Index Predicate, we have to take a pass
        # through the data and create indices.
        for field in self.deduper.blocker.index_fields:
            df = db.pandas_read("SELECT DISTINCT {field} FROM MDC_DEV.dbo.ProcessedVenture "
                                "WHERE {field} IS NOT NULL".format(field=field))
            dataset = [tuple(x) for x in df.values]
            field_data = set(row[0] for row in dataset)
            self.deduper.blocker.index(field_data, field)

        # Now we are ready to write our blocking map table by creating a
        # generator that yields unique `(BlockKey, ID)` tuples.
        db.execute("DELETE FROM MDC_DEV.dbo.BlockingMap")

        df = db.pandas_read(selection).set_index('ID').to_dict('index')
        b_data = self.deduper.blocker(df)
        sql = 'INSERT INTO MDC_DEV.dbo.BlockingMap VALUES (?,?)'

        print('Populating BlockingMap... ')
        # Chunk the blocked data into groups of 30,000 blocks to be inserted in the BlockingMap
        size = 30000
        main_list = list(b_data)
        b_data = None
        chunks = [main_list[x:x + size] for x in range(0, len(main_list), size)]
        main_list = None
        for chunk in chunks:
            db.bulk_insert(sql, chunk)

        self.deduper.blocker.resetIndices()
示例#28
0
    def get_campaigns(self, api_token, survey_id, session_variables,
                      surveys_df):

        if survey_id == 'w':
            while type(survey_id) != int:
                try:
                    survey_id = int(
                        input(
                            "Enter ID of survey that you would like to retrieve campaign data for: "
                        ))
                    if self.return_to_main(survey_id) == 1:
                        return
                    survey_id = self.validate_survey_id(
                        survey_id, session_variables, api_token, surveys_df)
                    survey_id = self.validate_survey_id(
                        survey_id, session_variables, api_token, surveys_df)
                except ValueError:
                    continue

        campaigns_df = sg_campaign.sg_campaigns_df(survey_id, api_token)
        print(campaigns_df)
        campaigns_df["id"] = campaigns_df["id"].apply(pd.to_numeric,
                                                      errors='ignore')

        # remove campaigns from df that are already in DB
        c_sql = CM.get_config("config.ini", "sql_queries",
                              "campaigns_for_survey")
        c_sql = c_sql.replace("WHAT_SURVEY_ID", str(survey_id))
        db_cmpgns = DB.pandas_read(c_sql)
        if db_cmpgns is not None:
            db_cmpgns = db_cmpgns.apply(pd.to_numeric, errors='ignore')

        cmpgns_not_in_db = pd.merge(campaigns_df,
                                    db_cmpgns,
                                    how='left',
                                    indicator=True,
                                    on="id")
        cmpgns_not_in_db2 = cmpgns_not_in_db[cmpgns_not_in_db['_merge'] ==
                                             'left_only'].drop("_merge",
                                                               axis=1)
        # cmpgns_not_in_db2 = cmpgns_not_in_db2.apply(pd.to_numeric, errors='ignore')

        # insert campaigns into DB
        if len(cmpgns_not_in_db2) > 0:
            insert_cmpgns_sql = "insert_campaigns"

            self.df_to_db(cmpgns_not_in_db2,
                          insert_cmpgns_sql,
                          remove_single_quotes=False,
                          clean_numeric_cols=True)

        return campaigns_df
示例#29
0
	def insert_dim_company(self, new_company):
		try:
			self.dim_company_id = self.get_table_seed('Reporting.DimCompany', 'CompanyID') + 1
			date_time = str(dt.datetime.utcnow())[:-3]
			dc = dict()
			dc['aCompanyID'] = self.dim_company_id
			dc['bName'] = new_company['Name']
			dc['cDescription'] = None
			dc['dPhone'] = None
			dc['ePhone2'] = None
			dc['fFax'] = None
			dc['gEmail'] = None
			dc['hWebsite'] = new_company['Website']
			dc['iCompanyType'] = None
			dc['jBatchID'] = new_company['BatchID']
			dc['kModifiedDate'] = date_time
			dc['lCreatedDate'] = date_time
			df = pd.DataFrame.from_dict([dc], orient='columns')
			values = CM.df_list(df)
			db.bulk_insert(sql.sql_dim_company_insert.value, values)
		except Exception as es:
			print(es)
示例#30
0
    def get_load_order(self, schema):
        """
        Takes name of schema, returns DataFrame
        with load order in first column.
        :param schema:
        :return DataFrame:
        """

        schema_str = "'" + str(schema) + "'"
        sql_str = CM.get_config("config.ini", "dependency_query",
                                "load_order_query")
        sql_str = sql_str.replace("WHAT_SCHEMA", schema_str)
        load_order = DAL.pandas_read(sql_str)
        return load_order