Example #1
def cleanKeyWords():
    """
    Inserts the distinct words and their occurrence counts into the table.
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from keywords1"
    rows = db.executeSQL(conn, sql)
    wordMap = {}
    for row in rows:
        word = row[0]
        wordMap[word] = wordMap.get(word, 0) + 1
    counter = 1
    for key in wordMap.keys():
        if util.emptyString(key) == 0:  # skip empty strings
            sql1 = "insert into clean_keywords1 values (" + str(
                counter) + ",'" + key + "'," + str(
                    wordMap[key]) + "," + str(0) + ")"
            print(sql1)
            cursor.execute(sql1)
            conn.commit()
            counter = counter + 1
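
Every example in this listing calls into a db helper module that is not shown here. A minimal sketch of what it would have to provide, assuming sqlite3 and a hypothetical database file name:

# db.py - minimal sketch of the helper module the examples assume.
# sqlite3 and the 'repos.db' file name are assumptions for illustration;
# the original project may use a different driver entirely.
import sqlite3

def getDBConnection():
    return sqlite3.connect('repos.db')

def executeSQL(conn, sql):
    cursor = conn.cursor()
    cursor.execute(sql)
    return cursor.fetchall()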
Example #2
import re
from nltk.stem import WordNetLemmatizer

def getdata():
    """
    Retrieves the data from the repository table, removes special
    characters and stop words, and lemmatizes the words (using the
    nltk package).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter = 1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc = row[1]
        if desc is not None:
            # normalize separators, lowercase, then drop anything that
            # is not alphanumeric or a space
            desc = desc.replace('-', ' ').replace(',', ' ').replace(
                '/', ' ').replace('.', ' ').replace('_', ' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]', '', desc)
            keywords = desc.split(" ")
            for word in keywords:
                # word = porter.stem(word.strip())  # stemming alternative, left disabled
                word = wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values(" + str(
                        counter) + ",'" + word + "'," + str(id) + ',' + str(
                            0) + ")"
                    print(sql1)
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter + 1
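
The concatenated INSERT above breaks as soon as a word contains a quote character (the regex cleanup happens to prevent that here, but cleanKeyWords() has no such guard). A parameterized equivalent, assuming the sqlite3-style placeholders of the db sketch above:

# Parameterized version of the INSERT in getdata(); '?' is sqlite3's
# placeholder style and is an assumption about the driver (MySQLdb
# would use %s instead).
sql1 = "insert into keywords1 values (?, ?, ?, ?)"
cursor.execute(sql1, (counter, word, id, 0))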
Example #3
import re
from nltk.stem import WordNetLemmatizer

def getdata():
    """
    Retrieves the data from the repository table, removes special
    characters and stop words, and lemmatizes the words (using the
    nltk package).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter = 1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc = row[1]
        if desc is not None:
            # normalize separators, lowercase, then drop anything that
            # is not alphanumeric or a space
            desc = desc.replace('-', ' ').replace(',', ' ').replace(
                '/', ' ').replace('.', ' ').replace('_', ' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]', '', desc)
            keywords = desc.split(" ")
            for word in keywords:
                # word = porter.stem(word.strip())  # stemming alternative, left disabled
                word = wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values(" + str(
                        counter) + ",'" + word + "'," + str(id) + ',' + str(
                            0) + ")"
                    print(sql1)
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter + 1
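
WordNetLemmatizer comes from NLTK and needs the wordnet corpus downloaded once before getdata() will run; a standalone check:

# One-time NLTK setup plus a quick sanity check of the lemmatizer.
import nltk
nltk.download('wordnet')  # required once before WordNetLemmatizer works
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('repositories'))  # -> 'repository'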
Example #4
import math

def prepareData():
    """
    Prepares the sparse matrix for k-means.
    """
    con = db.getDBConnection()
    rowCount = 0
    global repoLangMap
    for repoId in repoList:
        lang = repoLangMap[int(repoId)]
        if lang:  # skip rows with a missing or empty language
            langIndex = langMap[lang]
            S[rowCount, langIndex] = 10  # fixed weight for the language feature
        sql2 = 'select word, tf from keywords1 where repo_id=' + str(repoId)
        res = db.executeSQL(con, sql2)

        for row in res:
            word = row[0]
            tf = int(row[1])
            if word in wordDFMap:
                val = wordDFMap[word]
                index = int(val.split(":")[0])
                df = int(val.split(":")[1])
                print(rowCount, index)
                # tf-idf weight; convert noOfRepos to float *before*
                # dividing so the ratio is not truncated to an integer
                S[rowCount, index] = tf * math.log(float(noOfRepos) / df)

        rowCount = rowCount + 1
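
S, repoList, langMap, wordDFMap and noOfRepos are module-level globals built elsewhere. A plausible allocation for S, assuming scipy (the lil_matrix type is an assumption; any sparse format that supports item assignment would do):

# Hypothetical setup for the globals prepareData() writes into;
# dimensions follow from the code above: one row per repository, one
# column per surviving word plus one per language.
from scipy.sparse import lil_matrix

noOfRepos = len(repoList)
S = lil_matrix((noOfRepos, len(wordDFMap) + len(langMap)))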
Example #5
import math

def prepareData():
    """
    Prepares the sparse matrix for k-means.
    """
    con = db.getDBConnection()
    rowCount = 0
    global repoLangMap
    for repoId in repoList:
        lang = repoLangMap[int(repoId)]
        if lang:  # skip rows with a missing or empty language
            langIndex = langMap[lang]
            S[rowCount, langIndex] = 10  # fixed weight for the language feature
        sql2 = 'select word, tf from keywords1 where repo_id=' + str(repoId)
        res = db.executeSQL(con, sql2)

        for row in res:
            word = row[0]
            tf = int(row[1])
            if word in wordDFMap:
                val = wordDFMap[word]
                index = int(val.split(":")[0])
                df = int(val.split(":")[1])
                print(rowCount, index)
                # tf-idf weight; convert noOfRepos to float *before*
                # dividing so the ratio is not truncated to an integer
                S[rowCount, index] = tf * math.log(float(noOfRepos) / df)

        rowCount = rowCount + 1
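
Once S is filled, the clustering step itself is not in this listing; with scikit-learn (an assumption) it would look roughly like this:

# Hypothetical k-means call on the matrix built by prepareData();
# n_clusters=20 is an arbitrary illustrative value.
from sklearn.cluster import KMeans

km = KMeans(n_clusters=20, random_state=0)
labels = km.fit_predict(S.tocsr())  # CSR is the efficient format for fitting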
Example #6
def getRepos():
    """
    retrieve the repository ids, languages from the database
    """
    sql = 'select id, language from repository order by id ASC'
    con = db.getDBConnection()
    rows = db.executeSQL(con, sql)
    repoList = []
    global repoLangMap
    for row in rows:
        repoList.append(row[0])
        repoLangMap[row[0]] = row[1]
    return repoList
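
A short usage sketch; repoLangMap is assumed to be initialized to an empty dict at module scope before getRepos() runs:

repoLangMap = {}           # module-level map that getRepos() fills
repoList = getRepos()      # repository ids, ordered by id
print(len(repoList), 'repositories loaded')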
Example #7
def getRepos():
    """
    retrieve the repository ids, languages from the database
    """
    sql = 'select id, language from repository order by id ASC'
    con = db.getDBConnection()
    rows=db.executeSQL(con, sql);
    repoList = []
    global repoLangMap
    for row in rows:
        repoList.append(row[0])
        repoLangMap[row[0]] = row[1]
    return repoList
Example #8
def getCleanKeywords():
    """
    retrieve the words, document frequency>5 from the database
    """
    con = db.getDBConnection()
    sql = 'select word, df from clean_keywords1 where df>5'
    cleanWordRows = db.executeSQL(con, sql)
    counter = 0
    wordDFMap = {}
    for row in cleanWordRows:
        word = row[0]
        df = row[1]
        wordDFMap[word] = str(counter) + ":" + str(df)
        counter = counter + 1
    return wordDFMap
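
The "index:df" string packed here is split back apart in prepareData(); a tuple value would carry the same two numbers without the string round-trip (an alternative, not the original design):

# Alternative encoding: store (column index, document frequency) as a
# tuple instead of the "index:df" string the original re-parses later.
wordDFMap[word] = (counter, df)
index, df = wordDFMap[word]   # unpacking replaces the split(":") calls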
Example #9
def getCleanKeywords():
    """
    retrieve the words, document frequency>5 from the database
    """
    con = db.getDBConnection()
    sql = 'select word, df from clean_keywords1 where df>5'
    cleanWordRows = db.executeSQL(con, sql)
    counter =0
    wordDFMap ={}
    for row in cleanWordRows:
        word = row[0]
        df = row[1]
        wordDFMap[word] = str(counter)+":"+str(df)
        counter = counter+1
    return wordDFMap
Example #10
import gzip
import os
import sys

def readFiles():
    """
    Reads the gzipped json files in the current directory and processes them.
    """
    conn = db.getDBConnection()
    for zippedFile in os.listdir("."):
        print(zippedFile)
        f = None
        try:
            f = gzip.open(zippedFile, 'rb')
            file_content = f.read()
            parseJson(file_content, conn)
        except Exception:
            # sys.exc_info() replaces the Python-2-only sys.exc_traceback
            print('Error in line:' + str(sys.exc_info()[2].tb_lineno))
        finally:
            if f is not None:  # gzip.open itself may have failed
                f.close()
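
parseJson() is defined elsewhere in the project. If the archives are line-delimited JSON dumps (one event per line, as in GitHub Archive files, which is an assumption here), its core loop would look something like this:

import json

def parseJson(file_content, conn):
    # Hypothetical reconstruction - the real parseJson is not in this
    # listing. Assumes one JSON object per line.
    for line in file_content.splitlines():
        if line.strip():
            event = json.loads(line)
            print(event.get('type'))  # placeholder for the real insert logic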
Example #11
def getLanguage():
    """
    Retrieves the distinct languages from the database.
    """
    con = db.getDBConnection()
    sql = 'select distinct language from repository'
    res = db.executeSQL(con, sql)
    global noOfWords
    global langMap
    # language columns are appended after the word columns
    index = noOfWords
    print('languages index ' + str(index))
    for row in res:
        lang = row[0]
        if lang:  # skip empty (and NULL) languages
            print(lang)
            langMap[lang] = index
            index += 1
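
Because index starts at noOfWords, the language columns land directly after the word columns in the matrix; a tiny illustration with made-up numbers:

# Illustration only: suppose 3 words survived the df > 5 filter.
noOfWords = 3
langMap = {}
for i, lang in enumerate(['Java', 'Python']):
    langMap[lang] = noOfWords + i
print(langMap)  # -> {'Java': 3, 'Python': 4}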
Example #12
def getLanguage():
    """
    Retrieves the distinct languages from the database.
    """
    con = db.getDBConnection()
    sql = 'select distinct language from repository'
    res = db.executeSQL(con, sql)
    global noOfWords
    global langMap
    # language columns are appended after the word columns
    index = noOfWords
    print('languages index ' + str(index))
    for row in res:
        lang = row[0]
        if lang:  # skip empty (and NULL) languages
            print(lang)
            langMap[lang] = index
            index += 1
Example #13
def calculateTFIDF():
    """
    Calculates the term frequency and document frequency for every word
    and stores them in the keywords1 and clean_keywords1 tables (the
    idf factor itself is applied later, when the matrix is built).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from clean_keywords1"
    print(sql)
    rows = db.executeSQL(conn, sql)
    wordTFMap = {}
    wordDFMap = {}
    for row in rows:
        word = row[0]
        sql1 = "select repo_id from keywords1 where word='" + word + "'"
        print(sql1)
        res = db.executeSQL(conn, sql1)
        for row1 in res:
            repoId = row1[0]
            key = word + ':' + str(repoId)
            if key in wordTFMap:
                tfCount = wordTFMap[key]
                wordTFMap[key] = tfCount + 1
            else:
                # first occurrence of this word in this repo, so it also
                # contributes one to the word's document frequency
                wordTFMap[key] = 1
                if word in wordDFMap:
                    dfCount = wordDFMap[word]
                    wordDFMap[word] = dfCount + 1
                else:
                    wordDFMap[word] = 1

    for key in wordDFMap.keys():
        sql = 'update clean_keywords1 set df=' + str(
            wordDFMap[key]) + " where word='" + key + "'"
        print(sql)
        cursor.execute(sql)
        conn.commit()

    for key in wordTFMap.keys():
        row = key.split(":")
        sql = 'update keywords1 set tf=' + str(
            wordTFMap[key]
        ) + " where word='" + row[0] + "' and repo_id=" + row[1]
        print(sql)
        cursor.execute(sql)
        conn.commit()
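
Despite its name, calculateTFIDF() only stores the raw tf and df counts; the combined weight is computed later in prepareData(). Written out as a helper (for illustration; the original inlines the formula):

import math

def tfidf_weight(tf, df, n_repos):
    # Weight used when the sparse matrix is filled:
    #   w(word, repo) = tf(word, repo) * log(N / df(word))
    return tf * math.log(float(n_repos) / df)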
Example #14
def calculateTFIDF():
    """
    Calculates the term frequency and document frequency for every word
    and stores them in the keywords1 and clean_keywords1 tables (the
    idf factor itself is applied later, when the matrix is built).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from clean_keywords1"
    print(sql)
    rows = db.executeSQL(conn, sql)
    wordTFMap = {}
    wordDFMap = {}
    for row in rows:
        word = row[0]
        sql1 = "select repo_id from keywords1 where word='" + word + "'"
        print(sql1)
        res = db.executeSQL(conn, sql1)
        for row1 in res:
            repoId = row1[0]
            key = word + ':' + str(repoId)
            if key in wordTFMap:
                tfCount = wordTFMap[key]
                wordTFMap[key] = tfCount + 1
            else:
                # first occurrence of this word in this repo, so it also
                # contributes one to the word's document frequency
                wordTFMap[key] = 1
                if word in wordDFMap:
                    dfCount = wordDFMap[word]
                    wordDFMap[word] = dfCount + 1
                else:
                    wordDFMap[word] = 1

    for key in wordDFMap.keys():
        sql = 'update clean_keywords1 set df=' + str(
            wordDFMap[key]) + " where word='" + key + "'"
        print(sql)
        cursor.execute(sql)
        conn.commit()

    for key in wordTFMap.keys():
        row = key.split(":")
        sql = 'update keywords1 set tf=' + str(
            wordTFMap[key]
        ) + " where word='" + row[0] + "' and repo_id=" + row[1]
        print(sql)
        cursor.execute(sql)
        conn.commit()
Example #15
def cleanKeyWords():
    """
    Inserts the distinct words and their occurrence counts into the table.
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from keywords1"
    rows = db.executeSQL(conn, sql)
    wordMap = {}
    for row in rows:
        word = row[0]
        wordMap[word] = wordMap.get(word, 0) + 1
    counter = 1
    for key in wordMap.keys():
        if util.emptyString(key) == 0:  # skip empty strings
            sql1 = "insert into clean_keywords1 values (" + str(
                counter) + ",'" + key + "'," + str(
                    wordMap[key]) + "," + str(0) + ")"
            print(sql1)
            cursor.execute(sql1)
            conn.commit()
            counter = counter + 1
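
Taken together, the examples imply a pipeline roughly in this order. The driver below is a sketch, not code from the original project:

# Hypothetical driver wiring the functions above together.
if __name__ == '__main__':
    readFiles()                      # load raw JSON into the repository table
    getdata()                        # tokenize + lemmatize into keywords1
    cleanKeyWords()                  # distinct words and occurrence counts
    calculateTFIDF()                 # fill the tf and df columns
    wordDFMap = getCleanKeywords()   # words surviving the df > 5 filter
    repoList = getRepos()
    getLanguage()
    prepareData()                    # build the sparse matrix for k-means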