Exemplo n.º 1
0
import re
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns

# NOTE(review): the seaborn "tips" sample dataset is loaded here but is not
# used anywhere in the rest of this snippet.
tpis = sns.load_dataset("tips")


# Perform the SQL query to get the company column.
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# os.path.dirname() of a bare file name is '', so the key files below are
# resolved relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')

# Expand abbreviations using the (headerless) replacement list.
replaceList = pd.read_csv(
    os.path.join(currentDir, 'keyFiles/expandAbrevs.csv'),
    quotechar="'",
    header=None,
)
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.expandFromColumn(inputRaw['company'], replaceList))

# Obtain the erase list.
# NOTE(review): unlike the abbreviation list above, this read does not pass
# header=None, so the first CSV row is consumed as column names -- confirm
# that eraseStrings_v6.csv really starts with a header row.
eraseList = pd.read_csv(
    os.path.join(currentDir, 'keyFiles/eraseStrings_v6.csv'),
    quotechar="'",
)
# Apply the erase list.
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(semiCleanedOutput['company'], eraseList))

# Count the unique values and turn the result into a two-column table
# ("company", "count").  Naming the index and the value column explicitly
# (instead of renaming whatever reset_index() produced) gives the same table
# on every pandas version: pandas >= 2.0 already names the counts "count" and
# the index after the source column, which made the old rename produce two
# "count" columns.
tableUniqueFullNameCounts = (
    semiCleanedOutput.iloc[:, 0]
    .value_counts()
    .rename_axis("company")
    .reset_index(name="count")
)
Exemplo n.º 2
0
import ossPyFuncs
import pandas as pd
import wordcloud
import re
import matplotlib.pyplot as plt
import os
import nltk

 #perform sql query to get company column
# Fetch the raw "company" column from the contributors table.
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# os.path.dirname() of a bare file name is '', so both key files below are
# looked up relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')

# Load the headerless abbreviation list and run the expansion step; the call
# hands back two values, the second of which rebinds replaceList.
replaceList = pd.read_csv(
    os.path.join(currentDir, 'keyFiles/expandAbrevs.csv'),
    quotechar="'",
    header=None,
)
inputColumn, replaceList = ossPyFuncs.expandFromColumn(
    inputRaw['company'], replaceList)

# Load the headerless erase list and apply it.
# NOTE(review): the erase step is fed the raw column rather than inputColumn,
# and neither inputColumn nor semiCleanedOutput is used further down in this
# snippet -- confirm whether the token table should be built from the
# cleaned names instead.
eraseList = pd.read_csv(
    os.path.join(currentDir, 'keyFiles/eraseStrings_v6.csv'),
    quotechar="'",
    header=None,
)
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(inputRaw['company'], eraseList))

# Join every user's workplace name into one space-separated string
# (duplicates are deliberately kept: no unique() is applied first) ...
longString = inputRaw['company'].str.cat(sep=' ')

# ... break that string back apart on single spaces into one very long list ...
longStringSeparated = longString.split(' ')

# ... and wrap the token list in a dataframe.
uniqueSubTokenFrame = pd.DataFrame(longStringSeparated)
def spaceSymbolRemap(inputColumn):
    """Remap space/symbol/case variants of a string to the most common variant.

    The unique values of the first column of ``inputColumn`` are grouped by a
    normalized key: every character that is not a letter or digit is removed
    and the result is lowercased.  Within each group the most frequent
    spelling is kept (ties go to the spelling that sorts last, because the
    table is sorted descending on both count and name) and every other
    spelling in the group is scheduled for replacement by it.  Values whose
    key is empty (nothing but spaces/symbols) are never remapped.

    Keyword arguments:
    inputColumn -- a pandas dataframe whose first column holds the strings,
    presumably with duplicate entries, as frequency guides this process.
    Only the first column is inspected.

    Returns:
    The two values returned by ossPyFuncs.expandFromColumn when it is given
    ``inputColumn`` and the two-column (original spelling, remapping) table
    built here; see that function for their exact types.
    """
    import pandas as pd
    import ossPyFuncs

    columnName = inputColumn.columns[0]

    # Build the (name, count) table explicitly instead of renaming the output
    # of reset_index(): the column labels produced by value_counts() changed
    # in pandas 2.0, and the old rename then produced two "count" columns.
    counts = inputColumn[columnName].value_counts()
    tableUniqueFullNameCounts = pd.DataFrame({
        columnName: counts.index,
        'count': counts.values,
    })

    # Most frequent first; after the index reset, a lower row number always
    # means "at least as common".
    tableUniqueFullNameCounts = tableUniqueFullNameCounts.sort_values(
        by=['count', columnName],
        ascending=[False, False]).reset_index(drop=True)

    names = tableUniqueFullNameCounts[columnName]

    # One normalization for both sides of the comparison.  For plain ASCII
    # names this equals stripping everything outside [a-zA-Z0-9]; non-ASCII
    # letters are kept so that such names are not collapsed onto unrelated
    # entries.  regex=True is required: newer pandas treats the pattern
    # literally by default.
    normalizedKey = names.str.replace(r'[\W_]+', '', regex=True).str.lower()

    # Because of the sort above, the first row of each key group is the most
    # common spelling of that group.  This replaces a per-row regex scan of
    # the whole table with a single grouping pass.
    mostCommonName = names.groupby(normalizedKey).transform('first')

    # Remap only rows that are not already the group's winner and that have a
    # non-empty key.
    needsRemap = (normalizedKey.str.len() > 0) & (mostCommonName != names)
    tableUniqueFullNameCounts['remapping'] = mostCommonName.where(
        needsRemap, '')

    print('Remaping identification complete')

    # Two-column replacement table: original spelling -> spelling to use.
    replacementSubtable = tableUniqueFullNameCounts.loc[
        needsRemap, [columnName, 'remapping']]

    # Hand the replacement table to the shared replacement helper.
    fixedList, fixedReport = ossPyFuncs.expandFromColumn(
        inputColumn, replacementSubtable)

    print('remapping complete')
    return fixedList, fixedReport