Exemplo n.º 1
0
#perform sql query to get company column
postgreSql_selectQuery="SELECT company FROM gh.ctrs_raw ;"
inputRaw=ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

#perform sql query to get the legal entity abbreviation column
postgreSql_selectQuery="SELECT local_language_abbreviation FROM gleif.legal_entities;"
legalEntitiesRaw=ossPyFuncs.queryToPDTable(postgreSql_selectQuery)
#join all rows with ';' and split on ';' again, so any entry that itself
#contains ';' is broken into separate items, then keep the unique items
longLine=legalEntitiesRaw['local_language_abbreviation'].str.cat(sep=';')
longLineSeparated=pd.DataFrame(longLine.split(';'))
uniqueFrame=pd.DataFrame(longLineSeparated[0].unique())
#disabled filter that would drop the bare "co" / "co." entries:
#uniqueFrame=pd.DataFrame(uniqueFrame[0][~uniqueFrame[0].str.contains('(?i)^co\.$|^co$')]).reset_index(drop=True)

#wrap every abbreviation in a case-insensitive, word-bounded pattern
#NOTE(review): the abbreviations are not regex-escaped; confirm how
#ossPyFuncs.eraseFromColumn interprets these strings
sqlQueryFormattedFrame=pd.DataFrame('(?i)\\b'+uniqueFrame[0].astype(str)+'\\b')

inputColumn, eraseList=ossPyFuncs.eraseFromColumn(inputRaw['company'],sqlQueryFormattedFrame)

#order the erase list by its 'changeNum' column, largest first
eraseList.sort_values(by=['changeNum'],ascending=False,inplace=True)
eraseList.reset_index(drop=True,inplace=True)

#NOTE(review): barAbbreviations is not defined anywhere in this snippet; it
#has to exist already in the session before this line runs
longLine=barAbbreviations[0].str.cat(sep='|')

#formulate a good regex expression; the alternation is wrapped in a
#non-capturing group so the word boundaries apply to every alternative and
#not only to the first and the last one
currentRegex=re.compile('(?i)\\b(?:'+longLine+')\\b')

#get the abbreviations that contain "цак" at least ten times in a row
#(the previous pattern used '\1' in a non-raw string, which is the control
#character \x01 rather than a backreference)
test5=uniqueFrame[uniqueFrame[0].str.contains(r'(?:цак){10,}')]
Exemplo n.º 2
0
import wordcloud
import re
import matplotlib.pyplot as plt
import os

#perform sql query to get company column
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

#obtain the erase list
#NOTE(review): os.path.dirname() of a bare file name is '', so the key file
#below is looked up relative to the current working directory
currentDir = os.path.dirname('ossPyFuncs.py')
eraseList = pd.read_csv(os.path.join(currentDir, 'keyFiles/eraseStrings.csv'),
                        quotechar="'")
#apply the erase list
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(inputRaw['company'], eraseList))

#cat together all user's workplace names (note, we are not applying unique first)
longString = semiCleanedOutput['company'].str.cat(sep=' ')

#separate each word into an extremely long list
longStringSeparated = longString.split(' ')

#turn it into a dataframe
uniqueSubTokenFrame = pd.DataFrame(longStringSeparated)

#get the count on that column
columnUniqueCounts = uniqueSubTokenFrame.iloc[:, 0].value_counts()
#convert that output to a proper two-column table; the index and the values
#are named explicitly because the default names produced by
#value_counts().reset_index() differ between pandas versions, which made the
#former rename of {0, "index"} label the wrong column on newer releases
tableUniqueCounts = columnUniqueCounts.rename_axis("token").reset_index(
    name="count")
Exemplo n.º 3
0
import wordcloud
import re
import matplotlib.pyplot as plt
import os

#perform sql query to get company column
postgreSql_selectQuery="SELECT company FROM gh.ctrs_raw ;"
inputRaw=ossPyFuncs.queryToPDTable(postgreSql_selectQuery)
#force case insensitivity
lowerInput=pd.DataFrame(inputRaw['company'].str.lower())

#obtain the erase list
#NOTE(review): os.path.dirname() of a bare file name is '', so both paths
#built from currentDir are relative to the current working directory
currentDir=os.path.dirname('ossPyFuncs.py')
eraseList=pd.read_csv(os.path.join(currentDir,'keyFiles/eraseStrings_v6.csv'),quotechar="'")
#apply the erase list
semiCleanedOutput=ossPyFuncs.eraseFromColumn(lowerInput['company'],eraseList)

#replace interior spaces and periods (which the wordcloud splits at);
#regex=False makes both replacements literal, so the period is matched as a
#plain character no matter what the pandas default for `regex` is
spacesReplaced=semiCleanedOutput.str.replace(' ','_',regex=False)
periodsReplaced=spacesReplaced.str.replace('.','_',regex=False)


#turn that output into a long string
longString=periodsReplaced.str.cat(sep=' ')

#generate a wordcloud and convert it to svg
outcloud=wordcloud.WordCloud(width=2000, height=1000, max_words=2000).generate(longString)
svgCloud=outcloud.to_svg()

#save it down as an svg; the context manager guarantees the file is flushed
#and closed (previously the file was opened but never written or closed)
with open(os.path.join(currentDir,'figures/wordcloud.svg'),"w",encoding="utf-8") as svgOut:
    svgOut.write(svgCloud)
Exemplo n.º 4
0
import nltk

 #perform sql query to get company column
postgreSql_selectQuery="SELECT company FROM gh.ctrs_raw ;"
inputRaw=ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

#obtain the abbreviation expansion list and apply it
#NOTE(review): os.path.dirname() of a bare file name is '', so the key files
#are looked up relative to the current working directory
currentDir=os.path.dirname('ossPyFuncs.py')
replaceList=pd.read_csv(os.path.join(currentDir,'keyFiles/expandAbrevs.csv'),quotechar="'",header=None)
inputColumn, replaceList=ossPyFuncs.expandFromColumn(inputRaw['company'],replaceList)

#obtain the erase list
eraseList=pd.read_csv(os.path.join(currentDir,'keyFiles/eraseStrings_v6.csv'),quotechar="'",header=None)
#apply the erase list to the expanded column, so that the expansion step is
#not thrown away (it was previously applied to the raw column again)
semiCleanedOutput=pd.DataFrame(ossPyFuncs.eraseFromColumn(inputColumn,eraseList))

#cat together all user's workplace names (note, we are not applying unique first);
#the cleaned output is used here, the raw query column was used before which
#left both cleaning steps without any effect on the counts
#NOTE(review): assumes eraseFromColumn returns a single column - confirm
longString=semiCleanedOutput.iloc[:,0].str.cat(sep=' ')

#separate each word into an extremely long list
longStringSeparated=longString.split(' ')

#turn it into a dataframe
uniqueSubTokenFrame=pd.DataFrame(longStringSeparated)

#get the count on that column
columnUniqueCounts=uniqueSubTokenFrame.iloc[:,0].value_counts()
#convert that output to a proper table
tableUniqueCounts=columnUniqueCounts.reset_index()
#reset the names
Exemplo n.º 5
0
import matplotlib.pyplot as plt
import os

#form and perform the query
postgreSql_selectQuery = "SELECT login, company FROM gh.ctrs_raw ;"
result = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

#obtain the erase list
#NOTE(review): os.path.dirname() of a bare file name is '', so the key file
#below is looked up relative to the current working directory
currentDir = os.path.dirname('ossPyFuncs.py')
eraseList = pd.read_csv(os.path.join(currentDir,
                                     'keyFiles/eraseStrings_v6.csv'),
                        quotechar="'",
                        header=None)
#apply the erase list to the queried company column; the erase list itself
#is read with header=None, so it has no 'company' column to index
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(result['company'], eraseList))
#apply a lower to increase convergence/overlap
lowerInput = pd.DataFrame(semiCleanedOutput['company'].str.lower())

#get the unique counts
companyCounts = lowerInput['company'].value_counts()

#establish the binvals; the last edge is the largest observed count
binVals = np.asarray([0, 1, 5, 10, 20, 50, 100, 200, np.max(companyCounts)])

#for every (lower, upper] bin, add up the counts of all companies whose
#count falls inside that bin
binSum = np.zeros([len(binVals) - 1, 1])
for iBins, (lowerEdge, upperEdge) in enumerate(zip(binVals[:-1], binVals[1:])):
    inBin = np.logical_and(companyCounts > lowerEdge,
                           companyCounts <= upperEdge)
    binSum[iBins] = companyCounts[inBin].sum()