def modelSelect(train_file, order):
    """Build an n-gram language model for ``train_file`` with the configured toolkit.

    The toolkit is picked from the ``model_type`` entry of the
    ``language_model`` section of the config file at ``LMFILE``; the branches
    below handle ``MITLM``, ``SRILM`` and ``KENLM``.  Every command line is
    echoed to stdout and then executed through ``os.system``.  Each branch
    asks its tool to write the model to ``<train_file>.<order>grams``.

    train_file -- path of the training text
    order      -- n-gram order (formatted with ``%d``)
    """

    def shell(command):
        # Show the exact command line, then hand it to the shell.
        print(command)
        os.system(command)

    #File path to lm.ini depends on where you run it
    settings = Config(LMFILE).ConfigSectionMap("language_model")

    if settings["model_type"] == "MITLM":
        #Get vocab...
        shell("%s -t %s -write-vocab %s.vocab"
              % (settings["location"], train_file, train_file))
        #Alternative uses mitlm instead...
        shell('%s -order %d -v %s.vocab -unk -smoothing ModKN -t %s -write-lm %s.%dgrams'
              % (settings["location"], order, train_file, train_file, train_file, order))
    elif settings["model_type"] == "SRILM":
        #Original Srilm
        shell('%s -text %s -lm %s.kn.lm.gz -order %d -unk -kndiscount -interpolate -gt3min 1 -gt4min 1 -gt5min 1'
              % (settings["location"], train_file, train_file, order))
        shell('%s -lm %s.kn.lm.gz -unk -order %d -write-lm %s.%dgrams'
              % (settings["location2"], train_file, order, train_file, order))
        # The intermediate gzipped model is removed without being echoed.
        os.system('rm %s.kn.lm.gz' % train_file)
    elif settings["model_type"] == "KENLM":
        #Kenlm (Old - comparable to srilm
        #print('%s -o %d --interpolate_unigrams 0 <%s >%s.%dgrams'% (lm_args["location"],order, train_file, train_file, order))
        #os.system('%s -o %d --interpolate_unigrams 0 <%s >%s.%dgrams'% (lm_args["location"],order, train_file, train_file, order))
        #(--interpolate_unigrams not recommended)
        shell('%s -o %d <%s >%s.%dgrams'
              % (settings["location"], order, train_file, train_file, order))
    else:
        print("This is not a recognized language model.")
        print("Check ./lm.ini to make sure \'model\' is one")
        print("of MITLM,SRILM,or KENLM")
def train(input_file_dir, fold_num, order, downSample, splitSelection):
    """Train one language model per fold, each in its own forked child.

    ``create_fold`` is called first with the directory, fold count,
    ``downSample`` and ``splitSelection``.  The function then forks once per
    fold.  Each child trains on ``<input_file_dir>/fold<i>.train`` with the
    toolkit named by ``model_type`` in the ``language_model`` section of the
    config file at ``LMFILE``, echoing every command before running it with
    ``os.system``, and then exits.  The parent finally calls ``os.wait()``
    once per fold.

    A pipe is created for every fold; the child closes its read end and the
    parent its write end, but nothing is written to or read from the pipes
    in this function.
    """

    def shell(command):
        # Show the exact command line, then hand it to the shell.
        print(command)
        os.system(command)

    create_fold(input_file_dir, fold_num, downSample, splitSelection)
    pipes = [os.pipe() for _ in xrange(fold_num)]

    for fold in xrange(fold_num):
        read_end, write_end = pipes[fold]

        if os.fork() != 0:
            # Parent: drop the write end and move on to the next fold.
            os.close(write_end)
            continue

        # ---- child process from here on ----
        os.close(read_end)
        train_file = '%s/fold%d.train' % (input_file_dir, fold)
        discountStr = buildSmoother(order)  # result is not used below

        #File path to lm.ini depends on where you run it
        settings = Config(LMFILE).ConfigSectionMap("language_model")

        if settings["model_type"] == "MITLM":
            #Get vocab...
            shell("%s -t %s -write-vocab %s.vocab"
                  % (settings["location"], train_file, train_file))
            #Alternative uses mitlm instead...
            shell('%s -order %d -v %s.vocab -unk -smoothing ModKN -t %s -write-lm %s.%dgrams'
                  % (settings["location"], order, train_file, train_file, train_file, order))
        elif settings["model_type"] == "SRILM":
            #Original Srilm
            shell('%s -text %s -lm %s.kn.lm.gz -order %d -unk -kndiscount -interpolate'
                  % (settings["location"], train_file, train_file, order))
            shell('%s -lm %s.kn.lm.gz -unk -order %d -write-lm %s.%dgrams'
                  % (settings["location2"], train_file, order, train_file, order))
            # The intermediate gzipped model is removed without being echoed.
            os.system('rm %s.kn.lm.gz' % train_file)
        elif settings["model_type"] == "KENLM":
            #Kenlm
            shell('%s -o %d -S %s --interpolate_unigrams 0 <%s >%s.%dgrams'
                  % (settings["location"], order, "5%", train_file, train_file, order))
        else:
            print("This is not a recognized language model.")
            print("Check ./lm.ini to make sure \'model\' is one")
            print("of MITLM,SRILM,or KENLM")

        #Using the berkeleylm as a base instead:
        #print("java -ea -mx1000m -server -cp ~/berkeleylm/src edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText %d %s.%dgrams %s" % (order, train_file, order, train_file))
        #os.system("java -ea -mx1000m -server -cp ~/berkeleylm/src edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText %d %s.%dgrams %s" % (order, train_file, order, train_file)
        sys.exit()

    # One wait per fold.
    for _ in pipes:
        os.wait()
def __init__(self, text = "", language = "C", config_file = Util.CONFIG):
    """Set up an empty chunk for ``text`` written in ``language``.

    The keyword file path comes from the ``file`` entry of the ``Keywords``
    section of ``config_file``.  A language switcher is created for
    ``language`` and a scope tracker is created from that switcher.
    """
    #Get the keyword file through the Config and .ini system
    keyword_section = Config(config_file).ConfigSectionMap("Keywords")
    self.KeyWordFile = keyword_section['file']

    # Raw text and parse results.
    self.text = text
    self.functions = []
    self.initialized = False

    # Running totals, starting at zero.
    self.total_add = 0
    self.total_del = 0

    #What is the name given after '@@' in log
    self.header = ""

    # Language-specific helpers.
    switcher_factory = LanguageSwitcherFactory.LanguageSwitcherFactory
    self.langSwitch = switcher_factory.createLS(language)
    tracker_factory = ScopeTrackerFactory.ScopeTrackerFactory
    self.sT = tracker_factory.createST(self.langSwitch)
def test(config_file):
    """Smoke-test the database settings stored in ``config_file``.

    Reads the ``Database`` section, prints it, opens a ``DatabaseCon`` with
    the configured database, user, host, port and password, and prints one
    row per (language, project) pair whose language matches ``java``
    (case-insensitive ``iLike``), with the earliest and latest commit date.

    config_file -- path handed to ``Config``
    """
    db_config = Config(config_file).ConfigSectionMap("Database")

    # Format the dict into the message.  The original passed it as a second
    # print() argument (logging-style), which printed a literal "%r".
    print("Database configuration = %r\n" % (db_config,))

    dbCon = DatabaseCon(db_config['database'], db_config['user'],
                        db_config['host'], db_config['port'],
                        db_config['password'])

    # The table name comes from the config file, not from user input.
    sql_command = "SELECT language, project, min(commit_date), max(commit_date)"
    sql_command += " FROM " + db_config['table'] + " Where language iLike \'java\' group by language, project"

    for row in dbCon.execute(sql_command):
        print(row)
class ConfigInfo:
    '''
    Holds the path of the .ini config file, the Config object built from it,
    and the entries of its "Flags" section exposed as attributes
    (SEP, DEBUG, DEBUGLITE, DATABASE, CSV, LOGTIME).
    '''

    def __init__(self, newFile):
        self.setConfigFile(newFile)

    def setConfigFile(self, newFile):
        '''
        Point this object at newFile and reload every flag from its
        "Flags" section.
        '''
        self.CONFIG = newFile
        self.cfg = Config(self.CONFIG)
        flags = self.cfg.ConfigSectionMap("Flags")

        def enabled(name):
            # Flag values are strings; strtobool turns them into 0/1.
            return bool(util.strtobool(flags[name]))

        self.SEP = flags['sep']
        self.DEBUG = enabled('debug')
        self.DEBUGLITE = enabled('debuglite')
        self.DATABASE = enabled('database')
        self.CSV = enabled('csv')
        self.LOGTIME = enabled('logtime')
class dumpLogs:
    '''
    Writes parsed commit data to the database described by the "Database"
    section of the config file.

    Table names are always "<schema>.<table>", with both parts read from
    that section.
    '''

    def __init__(self, password, c_info):
        '''
        password -- database password passed to DatabaseCon
        c_info   -- config info object; its CONFIG and DEBUG attributes
                    are read here and in dumpMethodChanges
        '''
        self.config_info = c_info
        self.cfg = Config(self.config_info.CONFIG)
        self.dbPass = password
        self.connectDb()
        #self.cleanDb()

    @staticmethod
    def getFullTitleString(keywordDictionary):
        '''
        Create a string specifying not only the database column names but
        also their types. This is used when automatically creating the
        database table.

        Every key of keywordDictionary becomes a double-quoted integer
        column, with spaces and parentheses replaced by underscores.
        The dictionary values are not used.
        '''
        columns = [
            "project character varying(500)",
            "sha text",
            "language character varying(500)",
            "file_name text",
            "is_test boolean",
            "method_name text",
        ]
        for key in keywordDictionary:
            #ToStr will add ' around the strings...
            column = str(key).replace(" ", "_").replace("(", "_").replace(")", "_")
            columns.append("\"" + column + "\" integer")
        columns.extend([
            "total_adds integer",
            "total_dels integer",
            "warning_alert boolean",
        ])
        return "(" + ", ".join(columns) + ")"

    def _tableName(self, configKey):
        '''Return "<schema>.<table>" for the table stored under configKey.'''
        return self.db_config['schema'] + "." + self.db_config[configKey]

    def connectDb(self):
        '''Read the "Database" section and open the connection.'''
        self.db_config = self.cfg.ConfigSectionMap("Database")
        logging.debug("Database configuration = %r\n", self.db_config)
        self.dbCon = DatabaseCon(self.db_config['database'], self.db_config['user'],
                                 self.db_config['host'], self.db_config['port'],
                                 self.dbPass)

    def cleanDb(self):
        '''Delete every row of the method-detail and change-summary tables.'''
        # Interactive confirmation is disabled; the answer is always yes.
        response = 'y'  # raw_input("Deleting database %s ?" % (self.db_config['schema']))
        tables = [
            self._tableName('table_method_detail'),
            self._tableName('table_change_summary'),
        ]
        if response.lower().startswith('y'):
            for table in tables:
                print("Deleting table %r \n" % table)
                self.dbCon.insert("DELETE FROM " + table)
            self.dbCon.commit()

    def close(self):
        '''Commit pending work and close the connection.'''
        self.dbCon.commit()
        self.dbCon.close()

    #TODO: Improve security here for possible injections?
    def createSummaryTable(self):
        '''Create the change-summary table if it does not exist yet.'''
        table = self._tableName('table_change_summary')
        sql_command = "CREATE TABLE IF NOT EXISTS " + table + " (project character varying(500) NOT NULL," + \
                      " sha text NOT NULL, author character varying(500), author_email character varying(500)," + \
                      " commit_date date, is_bug boolean," + \
                      " CONSTRAINT change_summary_pkey PRIMARY KEY (project, sha)) WITH (OIDS=FALSE);"
        self.dbCon.create(sql_command)
        # Ownership/grant statements are disabled:
        #self.dbCon.create("ALTER TABLE " + table + " OWNER TO " + self.db_config['user'] + ";")
        #self.dbCon.create("GRANT ALL ON TABLE " + table + " TO " + self.db_config['user'] + ";")

    def createMethodChangesTable(self, titleString):
        '''
        Create the method-detail table if it does not exist yet.

        titleString -- parenthesised column definitions, as produced by
                       getFullTitleString
        '''
        table = self._tableName('table_method_detail')
        sql_command = "CREATE TABLE IF NOT EXISTS " + table + titleString + " WITH (OIDS=FALSE);"
        self.dbCon.create(sql_command)
        # Ownership/grant statements are disabled:
        #self.dbCon.create("ALTER TABLE " + table + " OWNER TO " + self.db_config['user'] + ";")
        #self.dbCon.create("GRANT ALL ON TABLE " + table + " TO " + self.db_config['user'] + ";")

    def dumpSummary(self, summaryStr):
        '''
        Insert one row into the change-summary table.

        summaryStr -- already-formatted SQL value list; it is spliced into
                      the statement verbatim, so the caller must quote it
        '''
        table = self._tableName('table_change_summary')
        sql_command = "INSERT INTO " + table + \
                      "(project, sha, author, author_email, commit_date, is_bug)" + \
                      " VALUES (" + summaryStr + ")"
        #print sql_command
        self.dbCon.insert(sql_command)
        #self.dbCon.commit()

    def dumpMethodChanges(self, methodChange, titleString):
        '''
        Insert one row into the method-detail table.

        methodChange -- already-formatted SQL value list, spliced in verbatim
        titleString  -- parenthesised column list matching methodChange
        '''
        table = self._tableName('table_method_detail')
        sql_command = "INSERT INTO " + table + titleString + " VALUES (" + methodChange + ")"
        if self.config_info.DEBUG:
            print(sql_command)
        self.dbCon.insert(sql_command)
parser.add_argument('-c', '--command', help='counter, condition, policy, vmprofile or vmgroup', required=True) parser.add_argument('-o', '--option', help='list, create or delete', required=True) args = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) config = Config() api = config.ConfigSectionMap("ConfigApi")['api'] apikey = config.ConfigSectionMap("ConfigApi")['apikey'] secret = config.ConfigSectionMap("ConfigApi")['secret'] cloudstack = CloudStack.Client(api, apikey, secret) project = config.ConfigSectionMap("Envs")['project'] zone = config.ConfigSectionMap("Envs")['zone'] projectid = listProjectId(project) zoneid = listZoneId(zone) if args.option == 'list': if args.command == 'counter': print Colors.BOLD + "Listing counters:" + Colors.ENDC print listCounters() elif args.command == 'condition':
def processLog(self, config=""):
    """Parse the git log at ``self.log_file`` and emit every commit found.

    Steps, in order:
      1. Install ``timeout`` as the SIGALRM handler and derive
         ``self.project_name`` from the directory that holds the log file.
      2. Open the outputs selected by the config flags: a ``dumpLogs``
         database writer when DATABASE is set, four CSV files under
         ``../Results`` when CSV is set.
      3. Read the log line by line (decoded as iso-8859-1), building one
         ``Sha`` object per commit with its patches; each ``Sha`` is
         appended to ``self.shas``.
      4. After the whole file is read, write every collected ``Sha`` to the
         database, to the CSV files, or to stdout.

    config -- config file path; defaults to ``self.config_info.CONFIG``.
              NOTE(review): the resolved value is not read again in this
              method.

    Returns the ``datetime.now()`` value taken when parsing finished,
    i.e. before the results are written out.
    """
    if (config == ""):
        config = self.config_info.CONFIG

    signal.signal(signal.SIGALRM, timeout)

    # The project name is the last component of the log file's directory.
    project1 = os.path.split(self.log_file)[0]
    project1 = project1.rstrip(os.sep)
    self.project_name = os.path.basename(project1)
    print(("---------- %s ------------\n" % (self.project_name)))

    if (self.config_info.DATABASE):
        dl = dumpLogs(self.dbPass, self.config_info)

    if (self.config_info.CSV):
        if not os.path.isdir("../Results"):
            os.mkdir("../Results")
        # inf1/inf2 are per-project files; the fPtr* files have fixed names.
        # Only inf1 and inf2 get a header line here.
        inf1 = open(
            "../Results/" + str(self.project_name) + "ChangeSummary.csv", 'w')
        fPtrChangeSummary = open("../Results/" + "ChangeSummary.csv", 'w')
        inf1.write("project,sha,author,author_email,commit_date,is_bug\n")
        inf2 = open(
            "../Results/" + str(self.project_name) + "PatchSummary.csv", 'w')
        fPtrPatchSummary = open("../Results/" + "PatchSummary.csv", 'w')

        # Build the keyword columns of the patch-summary header: one
        # '"<keyword>" adds' and one '"<keyword>" dels' column per included
        # keyword, sorted by name.
        lst = []
        listToDict = {}
        mockChunk = logChunk(
            "", "C", self.config_info
        )  #TODO: This is C specific, Why is this C specific?
        lst = mockChunk.readKeywords(lst)
        keywords = [k[0] for k in lst if k[1] == INCLUDED]
        for keyword in keywords:
            listToDict["\"" + str(keyword) + "\" adds"] = 0
            listToDict["\"" + str(keyword) + "\" dels"] = 0
        inf2.write(
            "project, sha, language, file_name, is_test, method_name,total_add,total_del,%s\n"
            % ",".join(sorted(listToDict.keys())))

    # NOTE(review): this handle is never closed in this method.
    inf = codecs.open(self.log_file, "r", "iso-8859-1")

    # Parser state carried across lines.
    shaObj = None  # commit currently being filled in
    patchObj = None  # file patch currently being filled in
    is_diff = False  # True once a 'diff --git' line was seen for this commit
    log_mssg = ""  # commit message, non-blank lines joined by tabs
    is_no_prev_ver = False  # set by '--- /dev/null'
    is_no_next_ver = False  # set by '+++ /dev/null'
    curLogChunk = logChunk("", "C", self.config_info)
    linenum = 0  # not updated anywhere below

    for l in inf:
        try:
            # Cancel any pending alarm before handling this line.
            # presumably the alarm is armed inside one of the helpers called
            # below; that is not visible here
            signal.alarm(0)
            sha = self.isSha(l)
            line = l
            #if(self.config_info.DEBUGLITE):
            #    try:
            #        print(line)
            #    except:
            #        pass

            # --- commit header lines -------------------------------------
            if sha:
                #Reverting back to version that outputs at the end...
                #if(shaObj != None):
                #    if(self.config_info.DEBUGLITE):
                #        print("Writing Sha:" + sha)
                #    if(self.config_info.DATABASE):
                #        shaObj.dumpSha(dl)
                #    elif(self.config_info.CSV):
                #        shaObj.shaToCsv(inf1,inf2,fPtrChangeSummary,fPtrPatchSummary)
                #    else:
                #        shaObj.printSha()
                shaObj = Sha(self.project_name, sha)
                #if(self.config_info.DEBUGLITE): #Save for testing.
                self.shas.append(
                    shaObj
                )  #This will become very memory intensive in large git logs.
                is_diff = False
                log_mssg = ""
                continue
            elif self.isAuthor(line, shaObj):
                continue
            elif self.isDate(line, shaObj):
                continue

            # Keep the untouched line for processPatch; use the right-stripped
            # one for the prefix tests below.
            fullLine = line
            line = line.rstrip()

            # --- commit message vs. start of a diff -----------------------
            if line.startswith('diff --git '):
                # The message is complete once the first diff starts.
                shaObj.setLog(log_mssg)
                is_diff = True
                is_no_prev_ver = False
                is_no_next_ver = False
                continue
                # NOTE(review): unreachable, follows an unconditional continue.
                if patchObj != None:
                    shaObj.patches.append(patchObj)
            elif is_diff == False:
                # Still inside the commit message: skip blank lines and
                # accumulate the rest.
                if not line.strip():
                    continue
                log_mssg += line + "\t"

            # --- inside a diff --------------------------------------------
            if is_diff:
                if line.startswith("--- a/"):
                    #Finish the changes to the old patch object
                    if (patchObj != None):
                        #If there is an existing chunk to parse, process it
                        if (curLogChunk.header != ""):
                            if (self.config_info.DEBUG):
                                print(("New diff with previous version: " + line))
                                print(("HEADER: " + curLogChunk.header))
                            self.processLastChunk(patchObj, curLogChunk)

                        #Reset the current chunk obj
                        if (self.config_info.DEBUG):
                            print("Resetting.")
                        curLogChunk.reset()
                        curLogChunk.setLang(
                            "." + self.cur_lang)  #DOUBLE CHECK ME!

                    patchObj = self.createPatch(line)
                    shaObj.patches.append(patchObj)
                    #print patchObj
                    #print shaObj.patches
                elif (line == '--- /dev/null'):  #earlier file was empty
                    is_no_prev_ver = True
                elif (line == '+++ /dev/null'
                      ):  #next file version was empty
                    is_no_next_ver = True
                    continue
                elif (is_no_prev_ver == True) and line.startswith("+++ b/"):
                    # New file: there was no '--- a/' line, so the patch is
                    # created from the '+++ b/' line instead.
                    #Finish the changes to the old patch object
                    if (patchObj != None):
                        if (curLogChunk.header != ""):  #If there is an existing chunk
                            if (self.config_info.DEBUG):
                                print((
                                    "New diff with no previous version: " + line))
                                print(("HEADER: " + curLogChunk.header))
                            self.processLastChunk(patchObj, curLogChunk)

                        if (self.config_info.DEBUG):
                            print("Resetting.")
                        curLogChunk.reset()
                        curLogChunk.setLang(
                            "." + self.cur_lang)  #DOUBLE CHECK ME!

                    patchObj = self.createPatchWithNoPrevVersion(line)
                    shaObj.patches.append(patchObj)
                else:  #Then we reached a content line.
                    self.processPatch(fullLine, patchObj, curLogChunk)

        except TimeExceededError.TimeExceededError:
            # A timed-out line is reported and skipped; parsing continues.
            print("Line Timed out, moving to next.")
            continue

    #Clear timeouts.
    signal.alarm(0)

    #Make sure to get the last patch in the file!
    if (curLogChunk.header != ""):  #If there is an existing chunk to parse
        if (self.config_info.DEBUG):
            print(("Last Patch: " + line))
            print(("HEADER: " + curLogChunk.header))
        self.processLastChunk(patchObj, curLogChunk)

    #if shaObj != None:
    #    shaObj.patches.append(patchObj)

    # Parsing is done here; everything below is output.
    parseFinish = datetime.now()

    if (self.shas != []):  #If the log wasn't empty...
        #Create the change summary table and the method change table now if necessary
        if (self.config_info.DATABASE):
            cfg = Config(self.config_info.CONFIG)
            keywordFile = cfg.ConfigSectionMap("Keywords")  # not used below
            full_title = dumpLogs.getFullTitleString(
                curLogChunk.getEmptyKeywordDict())

            dl.createSummaryTable()

            if (
                    full_title != ""
            ):  #Check if the changes table exists and create it if we have a title.
                dl.createMethodChangesTable(full_title)

        # DATABASE takes precedence over CSV; with neither flag set the
        # commits are printed.
        for s in self.shas:
            #s.printSha()
            if s != None:
                if (self.config_info.DATABASE):
                    s.dumpSha(dl)
                elif (self.config_info.CSV):
                    s.shaToCsv(inf1, inf2, fPtrChangeSummary,
                               fPtrPatchSummary)
                else:
                    s.printSha()

    #Write out last sha.
    #if(shaObj != None and self.config_info.DATABASE):
    #    if(self.config_info.DEBUGLITE):
    #        print("Writing to db.")
    #    shaObj.dumpSha(dl)

    if (self.config_info.DATABASE):
        print("Closing Time.")
        dl.close()

    if (self.config_info.CSV):
        inf1.close()
        inf2.close()
        fPtrChangeSummary.close()
        fPtrPatchSummary.close()

    print("Sha's processed:")
    print((len(self.shas)))

    return parseFinish
class ConfigInfo:
    """Typed access to the sections of the .ini config file.

    The "Database", "Repos", "Keywords", "Log" and "Flags" sections are
    loaded once in the constructor.  The "Flags" entries are also exposed
    as attributes (SEP, DEBUG, DEBUGLITE, DATABASE, CSV, LOGTIME, SZZ).
    """

    def __init__(self, newFile):
        self.configFile = newFile
        self.cfg = Config(self.configFile)
        self.config_db = self.cfg.ConfigSectionMap("Database")
        self.config_repo = self.cfg.ConfigSectionMap("Repos")
        self.config_key = self.cfg.ConfigSectionMap("Keywords")
        self.config_log = self.cfg.ConfigSectionMap("Log")
        self.config_flags = self.cfg.ConfigSectionMap("Flags")
        self.setFlags()

    def setFlags(self):
        """Copy the "Flags" section into attributes, converting booleans."""
        # The separator may be wrapped in single quotes in the .ini file.
        self.SEP = self.config_flags['sep'].strip('\'')
        self.DEBUG = bool(util.strtobool(self.config_flags['debug']))
        self.DEBUGLITE = bool(util.strtobool(self.config_flags['debuglite']))
        self.DATABASE = bool(util.strtobool(self.config_flags['database']))
        self.CSV = bool(util.strtobool(self.config_flags['csv']))
        self.LOGTIME = bool(util.strtobool(self.config_flags['logtime']))
        self.SZZ = bool(util.strtobool(self.config_flags['szz']))

    def _warnMissingRepoFile(self, repo_file):
        """Print the two-line notice shown when the repo url file is unreadable."""
        print("!! Repo url file \"%s\" does not exist." % repo_file)
        print("... Going to process all the repositories in the directory : \"%s\"." % self.getDumpLocation())

    def getRepos(self):
        """Return the set of repository names listed in the repo url file.

        Each non-blank line must have the form "<owner><os.sep><repo>";
        the part after the separator is collected.  If the file cannot be
        opened, a notice is printed and an empty set is returned.

        Raises ValueError for a non-blank line that does not split into
        exactly two parts.
        """
        repos = set()
        repo_file = self.config_repo['repo_url_file']
        try:
            # "with" closes the file even when a malformed line raises.
            with open(repo_file, 'r') as f:
                for line in f:
                    repo_url = line.strip()
                    if not repo_url:
                        # Blank (e.g. trailing) lines carry no repository.
                        continue
                    # NOTE(review): splitting on os.sep assumes the file
                    # uses the platform separator - confirm for Windows.
                    _, repo = repo_url.split(os.sep)
                    repos.add(repo)
        except IOError:
            self._warnMissingRepoFile(repo_file)
        return repos

    def getGitUrl(self, projName):
        """Return the GitHub URL of the repository named projName.

        The repo url file is scanned for the first line whose repository
        part equals projName.  Returns "" when there is no such line or
        when the file cannot be opened (a notice is printed in that case).
        """
        git_url = ""
        repo_file = self.config_repo['repo_url_file']
        try:
            with open(repo_file, 'r') as f:
                for line in f:
                    repo_url = line.strip()
                    if not repo_url:
                        continue
                    _, repo = repo_url.split(os.sep)
                    if repo == projName:
                        git_url = "http://github.com/" + repo_url
                        break
        except IOError:
            self._warnMissingRepoFile(repo_file)
        return git_url

    def getProjectLocation(self, projName):
        """Return the directory of projName below the configured dump location."""
        return os.path.join(self.config_repo['repo_locations'], projName)

    def getDumpLocation(self):
        """Return the configured 'repo_locations' directory."""
        return self.config_repo['repo_locations']

    def getPatchMode(self):
        """Return the boolean 'patch' setting of the "Log" section (default True)."""
        try:
            return bool(util.strtobool(self.config_log['patch']))
        except Exception:
            # Missing or unparsable setting: fall back to the default.
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit through.
            return True

    def getBugPatchMode(self):
        """Return the boolean 'bugPatch' setting of the "Log" section (default True)."""
        try:
            return bool(util.strtobool(self.config_log['bugPatch']))
        except Exception:
            return True

    def getLanguages(self):
        """Return the comma-separated 'languages' setting as a list.

        An empty list is returned when the setting is missing or unusable.
        """
        try:
            return self.config_log['languages'].split(",")
        except Exception:
            #Treat empty as showing all supported languages.
            return []
class dumpLogs:
    """Writes change summaries and per-method changes to the database.

    Connection settings and table names come from the "Database" section
    of the config file; tables are addressed as "<schema>.<table>".
    """

    def __init__(self, configFile='config.ini'):
        self.cfg = Config(configFile)
        self.connectDb()
        #self.cleanDb()

    def connectDb(self):
        """Load the "Database" section and open the connection."""
        settings = self.cfg.ConfigSectionMap("Database")
        self.db_config = settings
        logging.debug("Database configuration = %r\n", self.db_config)
        self.dbCon = DatabaseCon(settings['database'], settings['user'],
                                 settings['host'], settings['port'])

    def cleanDb(self):
        """Delete all rows of the method-detail and change-summary tables."""
        response = 'y'  # raw_input("Deleting database %s ?" % (self.db_config['schema']))
        schema = self.db_config['schema']
        tables = [schema + "." + self.db_config[key]
                  for key in ('table_method_detail', 'table_change_summary')]

        # The confirmation is hard-wired to 'y', so this guard never fires.
        if not response.lower().startswith('y'):
            return

        for table in tables:
            print("Deleting table %r \n" % table)
            self.dbCon.insert("DELETE FROM " + table)
        self.dbCon.commit()

    def close(self):
        """Commit pending work, then close the connection."""
        connection = self.dbCon
        connection.commit()
        connection.close()

    def dumpSummary(self, summaryStr):
        """Insert one change-summary row; summaryStr is the SQL value list."""
        table = self.db_config['schema'] + "." + self.db_config['table_change_summary']
        pieces = ("INSERT INTO ", table,
                  "(project, sha, author, commit_date, is_bug)",
                  "VALUES (", summaryStr, ")")
        #print sql_command
        self.dbCon.insert("".join(pieces))
        #self.dbCon.commit()

    def dumpMethodChanges(self, methodChange):
        """Insert one method-detail row; methodChange is the SQL value list."""
        table = self.db_config['schema'] + "." + self.db_config['table_method_detail']
        pieces = ("INSERT INTO ", table,
                  "(project, sha, language, file_name, is_test, method_name, assertion_add, ",
                  "assertion_del, total_add, total_del)",
                  "VALUES (", methodChange, ")")
        #print sql_command
        self.dbCon.insert("".join(pieces))