Example #1
def cleanKeyWords():
    """
    Inserts the distinct words and their occurrence counts into the table.
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from keywords1"
    rows = db.executeSQL(conn, sql)
    wordMap = {}
    for row in rows:
        word = row[0]
        wordMap[word] = wordMap.get(word, 0) + 1
    counter = 1
    for key in wordMap.keys():
        if util.emptyString(key) == 0:  # skip empty strings
            sql1 = "insert into clean_keywords1 values (" + str(
                counter) + ",'" + key + "'," + str(
                    wordMap[key]) + "," + str(0) + ")"
            print(sql1)
            cursor.execute(sql1)
            conn.commit()
            counter = counter + 1
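
Every example in this listing calls into a db helper module that is not shown here. A minimal sketch of what it would have to provide, assuming sqlite3 and a hypothetical database file name:

# db.py - minimal sketch of the helper module the examples assume.
# sqlite3 and the 'repos.db' file name are assumptions for illustration;
# the original project may use a different driver entirely.
import sqlite3

def getDBConnection():
    return sqlite3.connect('repos.db')

def executeSQL(conn, sql):
    cursor = conn.cursor()
    cursor.execute(sql)
    return cursor.fetchall()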
Example #2
import re
from nltk.stem import WordNetLemmatizer

def getdata():
    """
    Retrieves the data from the repository table, removes special
    characters and stop words, and lemmatizes the words (using the
    nltk package).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter = 1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc = row[1]
        if desc is not None:
            # normalize separators, lowercase, then drop anything that
            # is not alphanumeric or a space
            desc = desc.replace('-', ' ').replace(',', ' ').replace(
                '/', ' ').replace('.', ' ').replace('_', ' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]', '', desc)
            keywords = desc.split(" ")
            for word in keywords:
                # word = porter.stem(word.strip())  # stemming alternative, left disabled
                word = wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values(" + str(
                        counter) + ",'" + word + "'," + str(id) + ',' + str(
                            0) + ")"
                    print(sql1)
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter + 1
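
The concatenated INSERT above breaks as soon as a word contains a quote character (the regex cleanup happens to prevent that here, but cleanKeyWords() has no such guard). A parameterized equivalent, assuming the sqlite3-style placeholders of the db sketch above:

# Parameterized version of the INSERT in getdata(); '?' is sqlite3's
# placeholder style and is an assumption about the driver (MySQLdb
# would use %s instead).
sql1 = "insert into keywords1 values (?, ?, ?, ?)"
cursor.execute(sql1, (counter, word, id, 0))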
Example #3
import re
from nltk.stem import WordNetLemmatizer

def getdata():
    """
    Retrieves the data from the repository table, removes special
    characters and stop words, and lemmatizes the words (using the
    nltk package).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter = 1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc = row[1]
        if desc is not None:
            # normalize separators, lowercase, then drop anything that
            # is not alphanumeric or a space
            desc = desc.replace('-', ' ').replace(',', ' ').replace(
                '/', ' ').replace('.', ' ').replace('_', ' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]', '', desc)
            keywords = desc.split(" ")
            for word in keywords:
                # word = porter.stem(word.strip())  # stemming alternative, left disabled
                word = wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values(" + str(
                        counter) + ",'" + word + "'," + str(id) + ',' + str(
                            0) + ")"
                    print(sql1)
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter + 1
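
WordNetLemmatizer comes from NLTK and needs the wordnet corpus downloaded once before getdata() will run; a standalone check:

# One-time NLTK setup plus a quick sanity check of the lemmatizer.
import nltk
nltk.download('wordnet')  # required once before WordNetLemmatizer works
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('repositories'))  # -> 'repository'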
Example #4
import math

def prepareData():
    """
    Prepares the sparse matrix for k-means.
    """
    con = db.getDBConnection()
    rowCount = 0
    global repoLangMap
    for repoId in repoList:
        lang = repoLangMap[int(repoId)]
        if lang:  # skip rows with a missing or empty language
            langIndex = langMap[lang]
            S[rowCount, langIndex] = 10  # fixed weight for the language feature
        sql2 = 'select word, tf from keywords1 where repo_id=' + str(repoId)
        res = db.executeSQL(con, sql2)

        for row in res:
            word = row[0]
            tf = int(row[1])
            if word in wordDFMap:
                val = wordDFMap[word]
                index = int(val.split(":")[0])
                df = int(val.split(":")[1])
                print(rowCount, index)
                # tf-idf weight; convert noOfRepos to float *before*
                # dividing so the ratio is not truncated to an integer
                S[rowCount, index] = tf * math.log(float(noOfRepos) / df)

        rowCount = rowCount + 1
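
S, repoList, langMap, wordDFMap and noOfRepos are module-level globals built elsewhere. A plausible allocation for S, assuming scipy (the lil_matrix type is an assumption; any sparse format that supports item assignment would do):

# Hypothetical setup for the globals prepareData() writes into;
# dimensions follow from the code above: one row per repository, one
# column per surviving word plus one per language.
from scipy.sparse import lil_matrix

noOfRepos = len(repoList)
S = lil_matrix((noOfRepos, len(wordDFMap) + len(langMap)))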
Example #5
import math

def prepareData():
    """
    Prepares the sparse matrix for k-means.
    """
    con = db.getDBConnection()
    rowCount = 0
    global repoLangMap
    for repoId in repoList:
        lang = repoLangMap[int(repoId)]
        if lang:  # skip rows with a missing or empty language
            langIndex = langMap[lang]
            S[rowCount, langIndex] = 10  # fixed weight for the language feature
        sql2 = 'select word, tf from keywords1 where repo_id=' + str(repoId)
        res = db.executeSQL(con, sql2)

        for row in res:
            word = row[0]
            tf = int(row[1])
            if word in wordDFMap:
                val = wordDFMap[word]
                index = int(val.split(":")[0])
                df = int(val.split(":")[1])
                print(rowCount, index)
                # tf-idf weight; convert noOfRepos to float *before*
                # dividing so the ratio is not truncated to an integer
                S[rowCount, index] = tf * math.log(float(noOfRepos) / df)

        rowCount = rowCount + 1
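
Once S is filled, the clustering step itself is not in this listing; with scikit-learn (an assumption) it would look roughly like this:

# Hypothetical k-means call on the matrix built by prepareData();
# n_clusters=20 is an arbitrary illustrative value.
from sklearn.cluster import KMeans

km = KMeans(n_clusters=20, random_state=0)
labels = km.fit_predict(S.tocsr())  # CSR is the efficient format for fitting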
Example #6
def getRepos():
    """
    retrieve the repository ids, languages from the database
    """
    sql = 'select id, language from repository order by id ASC'
    con = db.getDBConnection()
    rows = db.executeSQL(con, sql)
    repoList = []
    global repoLangMap
    for row in rows:
        repoList.append(row[0])
        repoLangMap[row[0]] = row[1]
    return repoList
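
A short usage sketch; repoLangMap is assumed to be initialized to an empty dict at module scope before getRepos() runs:

repoLangMap = {}           # module-level map that getRepos() fills
repoList = getRepos()      # repository ids, ordered by id
print(len(repoList), 'repositories loaded')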
Example #7
def getRepos():
    """
    retrieve the repository ids, languages from the database
    """
    sql = 'select id, language from repository order by id ASC'
    con = db.getDBConnection()
    rows=db.executeSQL(con, sql);
    repoList = []
    global repoLangMap
    for row in rows:
        repoList.append(row[0])
        repoLangMap[row[0]] = row[1]
    return repoList
Example #8
def getCleanKeywords():
    """
    retrieve the words, document frequency>5 from the database
    """
    con = db.getDBConnection()
    sql = 'select word, df from clean_keywords1 where df>5'
    cleanWordRows = db.executeSQL(con, sql)
    counter = 0
    wordDFMap = {}
    for row in cleanWordRows:
        word = row[0]
        df = row[1]
        wordDFMap[word] = str(counter) + ":" + str(df)
        counter = counter + 1
    return wordDFMap
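
The "index:df" string packed here is split back apart in prepareData(); a tuple value would carry the same two numbers without the string round-trip (an alternative, not the original design):

# Alternative encoding: store (column index, document frequency) as a
# tuple instead of the "index:df" string the original re-parses later.
wordDFMap[word] = (counter, df)
index, df = wordDFMap[word]   # unpacking replaces the split(":") calls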
Example #9
def getCleanKeywords():
    """
    retrieve the words, document frequency>5 from the database
    """
    con = db.getDBConnection()
    sql = 'select word, df from clean_keywords1 where df>5'
    cleanWordRows = db.executeSQL(con, sql)
    counter =0
    wordDFMap ={}
    for row in cleanWordRows:
        word = row[0]
        df = row[1]
        wordDFMap[word] = str(counter)+":"+str(df)
        counter = counter+1
    return wordDFMap
Example #10
import gzip
import os
import sys

def readFiles():
    """
    Reads the gzipped json files in the current directory and processes them.
    """
    conn = db.getDBConnection()
    for zippedFile in os.listdir("."):
        print(zippedFile)
        f = None
        try:
            f = gzip.open(zippedFile, 'rb')
            file_content = f.read()
            parseJson(file_content, conn)
        except Exception:
            # sys.exc_info() replaces the Python-2-only sys.exc_traceback
            print('Error in line:' + str(sys.exc_info()[2].tb_lineno))
        finally:
            if f is not None:  # gzip.open itself may have failed
                f.close()
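
parseJson() is defined elsewhere in the project. If the archives are line-delimited JSON dumps (one event per line, as in GitHub Archive files, which is an assumption here), its core loop would look something like this:

import json

def parseJson(file_content, conn):
    # Hypothetical reconstruction - the real parseJson is not in this
    # listing. Assumes one JSON object per line.
    for line in file_content.splitlines():
        if line.strip():
            event = json.loads(line)
            print(event.get('type'))  # placeholder for the real insert logic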
Example #11
def getLanguage():
    """
    Retrieves the distinct languages from the database.
    """
    con = db.getDBConnection()
    sql = 'select distinct language from repository'
    res = db.executeSQL(con, sql)
    global noOfWords
    global langMap
    # language columns are appended after the word columns
    index = noOfWords
    print('languages index ' + str(index))
    for row in res:
        lang = row[0]
        if lang:  # skip empty (and NULL) languages
            print(lang)
            langMap[lang] = index
            index += 1
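
Because index starts at noOfWords, the language columns land directly after the word columns in the matrix; a tiny illustration with made-up numbers:

# Illustration only: suppose 3 words survived the df > 5 filter.
noOfWords = 3
langMap = {}
for i, lang in enumerate(['Java', 'Python']):
    langMap[lang] = noOfWords + i
print(langMap)  # -> {'Java': 3, 'Python': 4}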
Example #12
def getLanguage():
    """
    Retrieves the distinct languages from the database.
    """
    con = db.getDBConnection()
    sql = 'select distinct language from repository'
    res = db.executeSQL(con, sql)
    global noOfWords
    global langMap
    # language columns are appended after the word columns
    index = noOfWords
    print('languages index ' + str(index))
    for row in res:
        lang = row[0]
        if lang:  # skip empty (and NULL) languages
            print(lang)
            langMap[lang] = index
            index += 1
Example #13
def calculateTFIDF():
    """
    Calculates the term frequency and document frequency for every word
    and stores them in the keywords1 and clean_keywords1 tables (the
    idf factor itself is applied later, when the matrix is built).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from clean_keywords1"
    print(sql)
    rows = db.executeSQL(conn, sql)
    wordTFMap = {}
    wordDFMap = {}
    for row in rows:
        word = row[0]
        sql1 = "select repo_id from keywords1 where word='" + word + "'"
        print(sql1)
        res = db.executeSQL(conn, sql1)
        for row1 in res:
            repoId = row1[0]
            key = word + ':' + str(repoId)
            if key in wordTFMap:
                tfCount = wordTFMap[key]
                wordTFMap[key] = tfCount + 1
            else:
                # first occurrence of this word in this repo, so it also
                # contributes one to the word's document frequency
                wordTFMap[key] = 1
                if word in wordDFMap:
                    dfCount = wordDFMap[word]
                    wordDFMap[word] = dfCount + 1
                else:
                    wordDFMap[word] = 1

    for key in wordDFMap.keys():
        sql = 'update clean_keywords1 set df=' + str(
            wordDFMap[key]) + " where word='" + key + "'"
        print(sql)
        cursor.execute(sql)
        conn.commit()

    for key in wordTFMap.keys():
        row = key.split(":")
        sql = 'update keywords1 set tf=' + str(
            wordTFMap[key]
        ) + " where word='" + row[0] + "' and repo_id=" + row[1]
        print(sql)
        cursor.execute(sql)
        conn.commit()
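
Despite its name, calculateTFIDF() only stores the raw tf and df counts; the combined weight is computed later in prepareData(). Written out as a helper (for illustration; the original inlines the formula):

import math

def tfidf_weight(tf, df, n_repos):
    # Weight used when the sparse matrix is filled:
    #   w(word, repo) = tf(word, repo) * log(N / df(word))
    return tf * math.log(float(n_repos) / df)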
Example #14
def calculateTFIDF():
    """
    Calculates the term frequency and document frequency for every word
    and stores them in the keywords1 and clean_keywords1 tables (the
    idf factor itself is applied later, when the matrix is built).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from clean_keywords1"
    print(sql)
    rows = db.executeSQL(conn, sql)
    wordTFMap = {}
    wordDFMap = {}
    for row in rows:
        word = row[0]
        sql1 = "select repo_id from keywords1 where word='" + word + "'"
        print(sql1)
        res = db.executeSQL(conn, sql1)
        for row1 in res:
            repoId = row1[0]
            key = word + ':' + str(repoId)
            if key in wordTFMap:
                tfCount = wordTFMap[key]
                wordTFMap[key] = tfCount + 1
            else:
                # first occurrence of this word in this repo, so it also
                # contributes one to the word's document frequency
                wordTFMap[key] = 1
                if word in wordDFMap:
                    dfCount = wordDFMap[word]
                    wordDFMap[word] = dfCount + 1
                else:
                    wordDFMap[word] = 1

    for key in wordDFMap.keys():
        sql = 'update clean_keywords1 set df=' + str(
            wordDFMap[key]) + " where word='" + key + "'"
        print(sql)
        cursor.execute(sql)
        conn.commit()

    for key in wordTFMap.keys():
        row = key.split(":")
        sql = 'update keywords1 set tf=' + str(
            wordTFMap[key]
        ) + " where word='" + row[0] + "' and repo_id=" + row[1]
        print(sql)
        cursor.execute(sql)
        conn.commit()
Example #15
def cleanKeyWords():
    """
    Inserts the distinct words and their occurrence counts into the table.
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    sql = "select word from keywords1"
    rows = db.executeSQL(conn, sql)
    wordMap = {}
    for row in rows:
        word = row[0]
        wordMap[word] = wordMap.get(word, 0) + 1
    counter = 1
    for key in wordMap.keys():
        if util.emptyString(key) == 0:  # skip empty strings
            sql1 = "insert into clean_keywords1 values (" + str(
                counter) + ",'" + key + "'," + str(
                    wordMap[key]) + "," + str(0) + ")"
            print(sql1)
            cursor.execute(sql1)
            conn.commit()
            counter = counter + 1
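
Taken together, the examples imply a pipeline roughly in this order. The driver below is a sketch, not code from the original project:

# Hypothetical driver wiring the functions above together.
if __name__ == '__main__':
    readFiles()                      # load raw JSON into the repository table
    getdata()                        # tokenize + lemmatize into keywords1
    cleanKeyWords()                  # distinct words and occurrence counts
    calculateTFIDF()                 # fill the tf and df columns
    wordDFMap = getCleanKeywords()   # words surviving the df > 5 filter
    repoList = getRepos()
    getLanguage()
    prepareData()                    # build the sparse matrix for k-means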