def cleanKeyWords(): """ inserts the distinct words and occurence into the table """ conn = db.getDBConnection() cursor = conn.cursor() sql = "select word from keywords1" rows = db.executeSQL(conn, sql) wordMap = {} for row in rows: word = row[0] if word in wordMap: count = wordMap.get(word) wordMap[word] = count + 1 else: wordMap[word] = 1 counter = 1 for key in wordMap.keys(): if (util.emptyString(key) == 0): sql1 = "insert into clean_keywords1 values (" + str( counter) + ",'" + key + "'," + str( wordMap[key]) + "," + str(0) + ")" print sql1 cursor.execute(sql1) conn.commit() counter = counter + 1
def getdata(): """ retrieves the data from repository table removes the special characters and stop words and does the stemming(using nltk package) """ conn=db.getDBConnection() cursor = conn.cursor() global stopWordSet sql = "select id, description from repository" rows = db.executeSQL(conn, sql) counter=1 wnl = WordNetLemmatizer() for row in rows: id = row[0] desc= row[1] #print desc if desc is not None: desc=desc.replace('-',' ').replace(',',' ').replace('/',' ').replace('.',' ').replace('_',' ') desc = desc.lower() desc = re.sub('[^a-z0-9 ]','',desc) keywords = desc.split(" ") for word in keywords: #word = porter.stem(word.strip()) word=wnl.lemmatize(word.strip()) if word not in stopWordSet: sql1 = "insert into keywords1 values("+str(counter)+",'"+word+"',"+str(id)+ ',' + str(0) + ")" print sql1 cursor.execute(sql1) conn.commit() counter = counter+1
def getdata():
    """ Pull (id, description) from the repository table; clean each
    description (separator normalization, lower-casing, removal of
    non-alphanumerics), lemmatize with nltk's WordNetLemmatizer, drop
    stop words, and write one keywords1 row per surviving word as
    (counter, word, repo_id, tf=0).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    rows = db.executeSQL(conn, "select id, description from repository")
    counter = 1
    wnl = WordNetLemmatizer()
    for repo_id, desc in rows:
        if desc is None:
            continue
        desc = desc.replace('-', ' ').replace(',', ' ').replace('/', ' ') \
                   .replace('.', ' ').replace('_', ' ').lower()
        desc = re.sub('[^a-z0-9 ]', '', desc)
        for token in desc.split(" "):
            word = wnl.lemmatize(token.strip())
            # '' tokens come from consecutive separators; skip them
            # along with stop words instead of inserting empty keywords.
            if word and word not in stopWordSet:
                # Parameterized insert instead of string concatenation.
                # NOTE(review): %s paramstyle assumed (MySQLdb-style
                # driver) -- confirm against the db module.
                cursor.execute(
                    "insert into keywords1 values (%s, %s, %s, %s)",
                    (counter, word, repo_id, 0))
                conn.commit()
                counter += 1
def preapreData(): """ prepares the sparse matrix for kmeans """ con = db.getDBConnection() rowCount = 0 global repoLangMap for repoId in repoList: lang = repoLangMap[int(repoId)] if lang != '' and lang is not None: langIndex = langMap[lang] S[rowCount, langIndex] = 10 sql2 = 'select word, tf from keywords1 where repo_id= ' + str(repoId) #print sql2 res = db.executeSQL(con, sql2) #print len(res), repoId for row in res: word = row[0] tf = int(row[1]) if word in wordDFMap: val = wordDFMap[word] index = int(val.split(":")[0]) df = int(val.split(":")[1]) print rowCount, index S[rowCount, index] = tf * math.log(float(noOfRepos / df)) rowCount = rowCount + 1
def preapreData(): """ prepares the sparse matrix for kmeans """ con = db.getDBConnection() rowCount =0 global repoLangMap for repoId in repoList: lang = repoLangMap[int(repoId)] if lang!='' and lang is not None: langIndex=langMap[lang] S[rowCount, langIndex] = 10 sql2 = 'select word, tf from keywords1 where repo_id= '+ str(repoId) #print sql2 res = db.executeSQL(con, sql2) #print len(res), repoId for row in res: word = row[0] tf = int(row[1]) if word in wordDFMap: val=wordDFMap[word] index = int(val.split(":")[0]) df = int(val.split(":")[1]) print rowCount, index S[rowCount, index] = tf * math.log(float(noOfRepos/df)) rowCount = rowCount + 1
def getRepos():
    """ Fetch all repository ids in ascending order, remembering each
    repository's language in the global repoLangMap.

    Returns the list of repository ids.
    """
    con = db.getDBConnection()
    rows = db.executeSQL(
        con, 'select id, language from repository order by id ASC')
    global repoLangMap
    ids = []
    for repoId, language in rows:
        ids.append(repoId)
        repoLangMap[repoId] = language
    return ids
def getRepos():
    """ Retrieve (id, language) for every repository, ascending by id;
    record languages in the global repoLangMap and return the ids. """
    sql = 'select id, language from repository order by id ASC'
    con = db.getDBConnection()
    global repoLangMap
    result = []
    for row in db.executeSQL(con, sql):
        result.append(row[0])
        repoLangMap[row[0]] = row[1]
    return result
def getCleanKeywords():
    """ Load every word with document frequency > 5 from clean_keywords1.

    Returns a dict mapping word -> "columnIndex:df", where columnIndex
    is the word's position in the result set (later used as a sparse
    matrix column).
    """
    con = db.getDBConnection()
    rows = db.executeSQL(
        con, 'select word, df from clean_keywords1 where df>5')
    mapping = {}
    for position, row in enumerate(rows):
        mapping[row[0]] = str(position) + ":" + str(row[1])
    return mapping
def getCleanKeywords():
    """ Fetch words whose document frequency exceeds 5 and map each to
    an "index:df" string; the index doubles as the word's column in the
    sparse matrix. """
    con = db.getDBConnection()
    sql = 'select word, df from clean_keywords1 where df>5'
    wordDFMap = {}
    index = 0
    for word, df in db.executeSQL(con, sql):
        wordDFMap[word] = str(index) + ":" + str(df)
        index += 1
    return wordDFMap
def readFiles(): """ read json files and process it """ conn= db.getDBConnection() for zippedFile in os.listdir("."): print zippedFile try: f = gzip.open(zippedFile, 'rb') file_content = f.read() parseJson(file_content, conn) except Exception as e: print 'Error in line:'+str(sys.exc_traceback.tb_lineno) pass finally: f.close()
def getLanguage(): """ retireves the distinct languages from the database """ con = db.getDBConnection() sql = 'select distinct language from repository' res = db.executeSQL(con, sql) global noOfWords global langMap index = noOfWords print 'languages index' + str(index) for row in res: lang = row[0] if lang != '': print lang langMap[lang] = index index += 1
def getLanguage(): """ retireves the distinct languages from the database """ con = db.getDBConnection() sql = 'select distinct language from repository' res = db.executeSQL(con, sql) global noOfWords global langMap index=noOfWords print 'languages index' + str(index) for row in res: lang = row[0] if lang !='': print lang langMap[lang] = index index += 1
def calculateTFIDF(): """ calculates the term frequency- inverse document frequency and stores in the table """ conn = db.getDBConnection() cursor = conn.cursor() sql = "select word from clean_keywords1" print sql rows = db.executeSQL(conn, sql) wordTFMap = {} wordDFMap = {} for row in rows: word = row[0] sql1 = "select repo_id from keywords1 where word='" + word + "'" print sql1 res = db.executeSQL(conn, sql1) for row1 in res: repoId = row1[0] key = word + ':' + str(repoId) if key in wordTFMap: tfCount = wordTFMap[key] wordTFMap[key] = tfCount + 1 else: wordTFMap[key] = 1 if word in wordDFMap: dfCount = wordDFMap[word] wordDFMap[word] = dfCount + 1 else: wordDFMap[word] = 1 for key in wordDFMap.keys(): sql = 'update clean_keywords1 set df=' + str( wordDFMap[key]) + " where word='" + key + "'" print sql cursor.execute(sql) conn.commit() for key in wordTFMap.keys(): row = key.split(":") sql = 'update keywords1 set tf=' + str( wordTFMap[key] ) + " where word='" + row[0] + "' and repo_id=" + str(row[1]) print sql cursor.execute(sql) conn.commit()
def calculateTFIDF(): """ calculates the term frequency- inverse document frequency and stores in the table """ conn = db.getDBConnection() cursor = conn.cursor() sql = "select word from clean_keywords1" print sql rows = db.executeSQL(conn, sql) wordTFMap = {} wordDFMap = {} for row in rows: word=row[0] sql1 = "select repo_id from keywords1 where word='"+word+"'" print sql1 res=db.executeSQL(conn, sql1) for row1 in res: repoId = row1[0] key = word + ':'+ str(repoId) if key in wordTFMap: tfCount = wordTFMap[key] wordTFMap[key] = tfCount+1 else: wordTFMap[key] = 1 if word in wordDFMap: dfCount = wordDFMap[word] wordDFMap[word] = dfCount+1 else: wordDFMap[word] =1 for key in wordDFMap.keys(): sql = 'update clean_keywords1 set df='+str(wordDFMap[key]) + " where word='"+key+"'" print sql cursor.execute(sql) conn.commit() for key in wordTFMap.keys(): row=key.split(":") sql = 'update keywords1 set tf='+str(wordTFMap[key])+" where word='"+row[0]+"' and repo_id="+str(row[1]) print sql cursor.execute(sql) conn.commit()
def cleanKeyWords():
    """ Count occurrences of each distinct word in keywords1 and insert
    them into clean_keywords1 as (id, word, count, df=0); df is filled
    in later by calculateTFIDF().
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    rows = db.executeSQL(conn, "select word from keywords1")
    counts = {}
    for row in rows:
        counts[row[0]] = counts.get(row[0], 0) + 1
    counter = 1
    for word in counts.keys():
        if util.emptyString(word) == 0:  # 0 means "not empty" per original check
            # Parameterized insert replaces the string-built SQL, which
            # broke (or was injectable) for words containing quotes.
            # NOTE(review): %s paramstyle assumed (MySQLdb-style driver).
            cursor.execute(
                "insert into clean_keywords1 values (%s, %s, %s, %s)",
                (counter, word, counts[word], 0))
            conn.commit()
            counter += 1