def main():
    # Walk the Java corpus, extract statement "sentences" from every
    # compilation unit, build a vocabulary keyed on the first signature
    # element, and tally sentence-length statistics.
    par = plyj.parser.Parser()
    file_path = "../Java/Corpus/"
    cul = []
    vocab = {}     # s[0] -> list of remaining signature elements s[1:]
    sentlens = {}  # sentence length -> count
    sf = []        # importables
    fields = []
    ctr = 1
    # Files whose names contain these hashes are skipped.
    blacklist = ["5a8beeae20366b5094d0db8148e0563",
                 "3cd87ee90872cfcb72b3cb3b773d8efa"]
    for subdir, dirs, files in os.walk(file_path):
        for f in files:
            clear = True
            for h in blacklist:
                if h in f:
                    clear = False
            if f.endswith(".java") and clear:
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                #cul.extend(cus)
                for i, cu in cus:
                    #print cu.getStr()
                    sf2, fi, sents = seq.getSents(cu, i, "levels")
                    sf.extend(sf2)
                    fields.extend(fi)
                    print str(ctr) + ": " + str(len(sents))
                    ctr += 1
                    for sent, vl in sents:
                        #print str(len(sf)) + " importables"
                        #print str(len(fields)) + " fields"
                        if len(sent) not in sentlens:
                            sentlens[len(sent)] = 0
                        sentlens[len(sent)] += 1
                        if len(sent) > 0:
                            for stat in sent:
                                s = getSig(stat, vl)
                                if s[0] not in vocab:
                                    vocab[s[0]] = []
                                vocab[s[0]].append(s[1:])
        #break
    # Dump the resolved vocabulary and summary counts.
    for s in vocab:
        print s
        for sig in resolveSigs(vocab[s]):
            print '\t' + e.nstr(sig)
    print len(vocab)
    print len(set(sf))
    print len(set(fields))
    print sentlens
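# The corpus walk with the blacklist and ".java" filter above is repeated
# verbatim in the next main() variant.  A small helper like the following
# (hypothetical, not part of the original module) could factor it out:
def corpus_files(root, blacklist, ext=".java"):
    """Yield file paths under root that end with ext and whose names contain
    none of the blacklisted hashes."""
    for subdir, dirs, files in os.walk(root):
        for name in files:
            if name.endswith(ext) and not any(h in name for h in blacklist):
                yield os.path.join(subdir, name)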
def main():
    # Same corpus walk as above, but instead of printing statistics this
    # version writes the method sentences, variable sentences, and vocabulary
    # to text files under ../Data/Raw.  The sequencing mode ("cfs" or
    # "levels") can be passed as the first command-line argument.
    par = plyj.parser.Parser()
    modes = ["cfs", "levels"]
    if len(sys.argv) > 1:
        mode = sys.argv[1]
    else:
        mode = "levels"
    if mode not in modes:
        mode = "levels"
    corpus_path = "../Java/Corpus/"
    data_path = "../Data/Raw"
    ####
    meth_name = "method_sentences_" + mode + ".txt"
    var_name = "variable_sentences_" + mode + ".txt"
    vocab_name = "vocab_" + mode + ".txt"
    ####
    meth_file = open(os.path.join(data_path, meth_name), 'w')
    var_file = open(os.path.join(data_path, var_name), 'w')
    vocab_file = open(os.path.join(data_path, vocab_name), 'w')
    ####
    vocab = {}
    sf = []
    fields = []
    ctr = 1
    blacklist = ["5a8beeae20366b5094d0db8148e0563",
                 "3cd87ee90872cfcb72b3cb3b773d8efa"]
    for subdir, dirs, files in os.walk(corpus_path):
        for f in files:
            clear = True
            for h in blacklist:
                if h in f:
                    clear = False
            if f.endswith(".java") and clear:
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                for i, cu in cus:
                    sf2, fi, sents = seq.getSents(cu, i, mode)
                    sf.extend(sf2)
                    fields.extend(fi)
                    print str(ctr) + ": " + str(len(sents))
                    ctr += 1
                    # One method sentence per block, framed by <S2>/<S1>/<END>.
                    for sent, vl in sents:
                        meth_file.write("<S2>\n")
                        meth_file.write("<S1>\n")
                        for stat, ctx in sent:
                            meth_file.write(e.nstr(t.getSig(stat, vl, False))
                                            + ' # ' + e.nstr(ctx) + '\n')
                            s = t.getSig(stat, vl)
                            if s[0] not in vocab:
                                vocab[s[0]] = []
                            vocab[s[0]].append(s[1:])
                        meth_file.write('<END>\n')
                    # Variable sentences use the same framing, one entry per line.
                    vsents = seq.getVarSents(sents)
                    for vsent in vsents:
                        var_file.write("<S2>\n")
                        var_file.write("<S1>\n")
                        for stat, ctx in vsent:
                            var_file.write(e.nstr(stat) + '\n')
                        var_file.write('<END>\n')
        #break
    # Resolved vocabulary: one key per line, resolved signatures indented below it.
    for s in vocab:
        vocab_file.write(s + '\n')
        for sig in t.resolveSigs(vocab[s]):
            vocab_file.write('\t' + e.nstr(sig) + '\n')
    meth_file.close()
    var_file.close()
    vocab_file.close()
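# For reference, the layout of the files written above, reconstructed from the
# write() calls (the <signature>, <context>, and <statement> strings are
# placeholders):
#
#   method_sentences_<mode>.txt:       variable_sentences_<mode>.txt:
#     <S2>                               <S2>
#     <S1>                               <S1>
#     <signature> # <context>            <statement>
#     ...                                ...
#     <END>                              <END>
#
# vocab_<mode>.txt lists each vocabulary key on its own line, followed by its
# resolved signatures, one per line, each indented with a tab.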
def getLL(cu, i, seq_mode="levels", meth_prob="MEMM", var_prob=3, var_voc="pot", fill="max", num_guess=200): data_path = "../Data/Revised" vocab_name = "vocab_" + seq_mode + ".txt" counts_name = "counts_" + seq_mode + ".txt" memm_name = "memm_" + seq_mode + ".txt" vocab_file = open(os.path.join(data_path, vocab_name), 'rb') vocab_data = pickle.load(vocab_file) meth_sigs, meth_vocab_list, pot_var_vocab_list, act_var_vocab_list = vocab_data print "Read vocab data" vocab_file.close() if var_voc == "act": var_vocab_list = act_var_vocab_list pot_var_vocab_list = None else: var_vocab_list = pot_var_vocab_list act_var_vocab_list = None count_file = open(os.path.join(data_path, counts_name), 'rb') count_data = pickle.load(count_file) meth_count, pot_count, act_count = count_data print "Read count data" count_file.close() if var_voc == "act": var_count = act_count pot_count = None else: var_count = pot_count act_count = None if type(meth_prob) == int: meth_dat = [meth_prob, meth_vocab_list] meth_dat.extend(meth_count) prob_mode = "ngram" else: meth_count = None memm_file = open(os.path.join(data_path, memm_name), 'rb') meth_dat = pickle.load(memm_file) meth_dat = [meth_dat, meth_vocab_list] print "Read MEMM" prob_mode = "MEMM" memm_file.close() imp, fi, sents = seq.getSents(cu, i, seq_mode) word_ll = {} noUnk = True for word in meth_vocab_list: if type(word) is not str and word[0].split('$')[0] in i: ll = 0 ctr = 0 for sent, vl in sents: newsent = [] for stat, ctx in sent: if stat == "UNK": newsent.append((word, ctx)) noUnk = False else: f = stat[1] n = len(stat) - 2 newsent.append(((f, n), ctx)) newsent = seq.getFeatures([newsent])[0] newsent.insert(0, "<S1>") newsent.insert(0, "<S2>") newsent.append("<END>") if type(meth_prob) == int: ctr += len(newsent) - 2 else: ctr += len(newsent) - 3 ll += seqProb(newsent, meth_dat, prob_mode) ll /= ctr word_ll[word] = ll word_ll = sorted(word_ll.items(), key=operator.itemgetter(1), reverse=True) top_guess = {} i = 0 if noUnk: ll = 0 vsents = seq.getVarSents2(sents) var_dat = [var_prob, var_vocab_list] var_dat.extend(var_count) for sen in vsents: sen.insert(0, "<S1>") sen.insert(0, "<S2>") sen.append("<END>") ctr += len(sen) - 2 #print sen ll += varSeqProb(sen, var_dat) top_guess = [[["NILL"], ll / ctr]] else: if fill == "max": len_guess = 20 else: len_guess = 1 num_guess = 1 while i < len(word_ll) and len(top_guess) < len_guess: if fill == "max": w = word_ll[i][0] else: w = random.choice(word_ll)[0] i += 1 var_guess = var_guesses(w, cu, meth_sigs, num_guess) if len(var_guess) > 0: var_dat = [var_prob, var_vocab_list] var_dat.extend(var_count) for call in var_guess: ll = 0 ctr = 0 vsents = [] for sent, vl in sents: newsent = [] for stat, ctx in sent: if stat == "UNK": newsent.append((call, ctx)) else: newsent.append((stat, ctx)) vsents.append([newsent, vl]) vsents = seq.getVarSents2(vsents) for sen in vsents: sen.insert(0, "<S1>") sen.insert(0, "<S2>") sen.append("<END>") ctr += len(sen) - 2 #print sen ll += varSeqProb(sen, var_dat) top_guess[call] = ll / ctr top_guess = sorted(top_guess.items(), key=operator.itemgetter(1), reverse=True) return top_guess
def getLL(cu, i, seq_mode = "levels", meth_prob = "MEMM", var_prob = 3, var_voc = "pot", fill = "max", num_guess = 200): data_path = "../Data/Revised" vocab_name = "vocab_" + seq_mode + ".txt" counts_name = "counts_" + seq_mode + ".txt" memm_name = "memm_" + seq_mode + ".txt" vocab_file = open(os.path.join(data_path, vocab_name), 'rb') vocab_data = pickle.load(vocab_file) meth_sigs, meth_vocab_list, pot_var_vocab_list, act_var_vocab_list = vocab_data print "Read vocab data" vocab_file.close() if var_voc == "act": var_vocab_list = act_var_vocab_list pot_var_vocab_list = None else: var_vocab_list = pot_var_vocab_list act_var_vocab_list = None count_file = open(os.path.join(data_path, counts_name), 'rb') count_data = pickle.load(count_file) meth_count, pot_count, act_count = count_data print "Read count data" count_file.close() if var_voc == "act": var_count = act_count pot_count = None else: var_count = pot_count act_count = None if type(meth_prob) == int: meth_dat = [meth_prob, meth_vocab_list] meth_dat.extend(meth_count) prob_mode = "ngram" else: meth_count = None memm_file = open(os.path.join(data_path, memm_name), 'rb') meth_dat = pickle.load(memm_file) meth_dat = [meth_dat, meth_vocab_list] print "Read MEMM" prob_mode = "MEMM" memm_file.close() imp, fi, sents = seq.getSents(cu, i, seq_mode) word_ll = {} noUnk = True for word in meth_vocab_list: if type(word) is not str and word[0].split('$')[0] in i: ll = 0 ctr = 0 for sent, vl in sents: newsent = [] for stat, ctx in sent: if stat == "UNK": newsent.append((word, ctx)) noUnk = False else: f = stat[1] n = len(stat) - 2 newsent.append(((f, n), ctx)) newsent = seq.getFeatures([newsent])[0] newsent.insert(0, "<S1>") newsent.insert(0, "<S2>") newsent.append("<END>") if type(meth_prob) == int: ctr += len(newsent) - 2 else: ctr += len(newsent) - 3 ll += seqProb(newsent, meth_dat, prob_mode) ll /= ctr word_ll[word] = ll word_ll = sorted(word_ll.items(), key=operator.itemgetter(1), reverse = True) top_guess = {} i = 0 if noUnk: ll = 0 vsents = seq.getVarSents2(sents) var_dat = [var_prob, var_vocab_list] var_dat.extend(var_count) for sen in vsents: sen.insert(0, "<S1>") sen.insert(0, "<S2>") sen.append("<END>") ctr += len(sen) - 2 #print sen ll += varSeqProb(sen, var_dat) top_guess = [[["NILL"], ll / ctr]] else: if fill == "max": len_guess = 20 else: len_guess = 1 num_guess = 1 while i < len(word_ll) and len(top_guess) < len_guess: if fill == "max": w = word_ll[i][0] else: w = random.choice(word_ll)[0] i += 1 var_guess = var_guesses(w, cu, meth_sigs, num_guess) if len(var_guess) > 0: var_dat = [var_prob, var_vocab_list] var_dat.extend(var_count) for call in var_guess: ll = 0 ctr = 0 vsents = [] for sent, vl in sents: newsent = [] for stat, ctx in sent: if stat == "UNK": newsent.append((call, ctx)) else: newsent.append((stat, ctx)) vsents.append([newsent, vl]) vsents = seq.getVarSents2(vsents) for sen in vsents: sen.insert(0, "<S1>") sen.insert(0, "<S2>") sen.append("<END>") ctr += len(sen) - 2 #print sen ll += varSeqProb(sen, var_dat) top_guess[call] = ll / ctr top_guess = sorted(top_guess.items(), key=operator.itemgetter(1), reverse = True) return top_guess