def composeWorkplaceOntology():
    """Assemble a one-column table of known workplace names.

    Government agency names and university names are pulled from the
    database through ``ossPyFuncs.queryToPDTable`` and stacked, agencies
    first, into a single pandas DataFrame.  Duplicates are kept and each
    source table's own row index is carried over unchanged.

    Returns:
        pandas.DataFrame: the concatenated names in a single column.
    """
    import ossPyFuncs
    import pandas as pd

    # The whole government table is fetched; only AgencyName is used below.
    agencies = ossPyFuncs.queryToPDTable(
        "SELECT * FROM us_gov_manual.us_govman_2019 ;")
    universities = ossPyFuncs.queryToPDTable(
        "SELECT institution FROM hipolabs.universities ;")

    stackedNames = pd.concat(
        [agencies['AgencyName'], universities['institution']])
    return pd.DataFrame(stackedNames)
def composeWorkplaceOntology():
    """Build a table of distinct workplace institution names.

    Names are gathered, in this order, from the US government manual table,
    the universities table and three Fortune company tables, all read
    through ``ossPyFuncs.queryToPDTable``.  The columns are concatenated and
    reduced to their unique values (first occurrence wins the position).

    Returns:
        pandas.DataFrame: one column holding each distinct name once.
    """
    import ossPyFuncs
    import pandas as pd

    # (query, column to keep) for every source, in output order.
    # The government table is fetched with SELECT * and the mixed-case
    # AgencyName column is picked out on the pandas side; per the original
    # author's note, capitalised column names cause problems in the query.
    sources = (
        ("SELECT * FROM us_gov_manual.us_govman_2019 ;", 'AgencyName'),
        ("SELECT institution FROM hipolabs.universities ;", 'institution'),
        ("SELECT company FROM forbes.fortune2018_us1000;", 'company'),
        ("SELECT company FROM forbes.fortune2019_us1000;", 'company'),
        ("SELECT company FROM forbes.fortune2020_global2000;", 'company'),
    )

    # run every query first, then pull the wanted column from each result
    fetched = [ossPyFuncs.queryToPDTable(query) for query, _ in sources]
    nameColumns = [
        table[column] for table, (_, column) in zip(fetched, sources)
    ]

    # flatten to one series and keep each name a single time
    distinctNames = pd.concat(nameColumns).unique()
    return pd.DataFrame(distinctNames)
"""
Created on Thu Jun 25 09:54:55 2020

Builds a list of case-insensitive, word-bounded regex patterns from the
legal-entity abbreviations in gleif.legal_entities, passes them together with
the company column of gh.ctrs_raw to ossPyFuncs.eraseFromColumn, and sorts the
returned erase list by its 'changeNum' column (largest first).

@author: dnb3k
"""

import ossPyFuncs
import pandas as pd
import wordcloud
import re
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns

# perform sql query to get company column
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# perform sql query to get the legal-entity abbreviations
postgreSql_selectQuery = "SELECT local_language_abbreviation FROM gleif.legal_entities;"
legalEntitiesRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# a single row may hold several ';'-separated abbreviations, so join all rows
# with ';' and split again to get one abbreviation per entry
longLine = legalEntitiesRaw['local_language_abbreviation'].str.cat(sep=';')
longLineSeparated = pd.DataFrame(longLine.split(';'))
uniqueFrame = pd.DataFrame(longLineSeparated[0].unique())

# NOTE: an earlier variant dropped the bare 'co' / 'co.' entries here using
# the pattern '(?i)^co\.$|^co$'; that filter is currently disabled.

# The abbreviations are literal text, so escape regex metacharacters
# ('.', '(', '+', ...) before embedding them in a pattern; otherwise e.g.
# 'S.A.' would match any character in place of each dot and entries with
# unbalanced brackets would not compile at all.
escapedAbbreviations = uniqueFrame[0].astype(str).map(re.escape)
sqlQueryFormattedFrame = pd.DataFrame('(?i)\\b' + escapedAbbreviations + '\\b')

# apply the patterns; eraseFromColumn hands back the processed column and a
# per-pattern table that is ranked here by its 'changeNum' column
inputColumn, eraseList = ossPyFuncs.eraseFromColumn(inputRaw['company'], sqlQueryFormattedFrame)
eraseList.sort_values(by=['changeNum'], ascending=False, inplace=True)
"""
this script generates a table containing the unique sub-tokens (i.e.
individual words) found in workplace names

@author: dnb3k
"""

import ossPyFuncs
import pandas as pd
import wordcloud
import re
import matplotlib.pyplot as plt
import os

# perform sql query to get company column
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# obtain the erase list
# os.path.dirname('ossPyFuncs.py') is always '' (a bare file name has no
# directory part), which made the CSV path depend on the working directory.
# Resolve the directory from the imported module instead.
# NOTE(review): assumes keyFiles/ sits next to ossPyFuncs.py -- confirm.
currentDir = os.path.dirname(os.path.abspath(ossPyFuncs.__file__))
eraseList = pd.read_csv(os.path.join(currentDir, 'keyFiles/eraseStrings.csv'),
                        quotechar="'")

# apply the erase list
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(inputRaw['company'], eraseList))

# cat together all users' workplace names (note, we are not applying unique
# first, so repeated names contribute repeated words)
longString = semiCleanedOutput['company'].str.cat(sep=' ')

# separate each word into an extremely long list; splitting on a single
# space means consecutive spaces yield empty-string entries
longStringSeparated = longString.split(' ')
""" Created on Fri Jun 12 08:16:24 2020 This function creates a plot that depicts the number of people that work at a company that has some number of employees associated with it. @author: dnb3k """ import ossPyFuncs import numpy as np import pandas as pd import matplotlib.pyplot as plt import os #form and perform the query postgreSql_selectQuery = "SELECT login, company FROM gh.ctrs_raw ;" result = ossPyFuncs.queryToPDTable(postgreSql_selectQuery) #obtain the eralse list currentDir = os.path.dirname('ossPyFuncs.py') eraseList = pd.read_csv(os.path.join(currentDir, 'keyFiles/eraseStrings_v6.csv'), quotechar="'", header=None) #apply the erase list semiCleanedOutput = pd.DataFrame( ossPyFuncs.eraseFromColumn(eraseList['company'], eraseList)) #apply a lower to increase convergence/overlap lowerInput = pd.DataFrame(semiCleanedOutput['company'].str.lower()) #get the unique counts companyCounts = lowerInput['company'].value_counts()
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Jun 15 14:21:06 2020 @author: dnb3k """ import pandas as pd import ossPyFuncs remapTable = pd.read_csv('workplaceMapping.csv') postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;" inputColumn = ossPyFuncs.queryToPDTable(postgreSql_selectQuery) def remapColumnValuesfromTable(inputColumn, remapTable): import ossPyFuncs import pandas as pd import numpy as np import difflib gitWorkplaceCounts = inputColumn['company'].value_counts() sortedTable = gitWorkplaceCounts.reset_index() sortedTable.rename(columns={ "index": "company name", "company": "count" }, inplace=True)