def getLabeledSentence(data_set_name, outfileName): labelGrammer = createDegreeGrammar() data = datautils.loadJson(data_set_name) pattern1 = ["DE_LEVEL", StarRepetition([",","DE_LEVEL"]), QuestionRepetition(["OR","DE_LEVEL"]),"DEGREE" ] fst = TokenRegex(pattern1) matchSum = 0 f = open(outfileName, "w") for item in data: # print item words = item[2].split() degreeSent = JobSentence(words) labelGrammer.labelSentence(degreeSent) labeledArray = degreeSent.getLabeledArray() array = [x[0] for x in labeledArray ] print item[0], ": " ,array match = fst.match(array) print "match=", match if match : matchSum += 1 # printTrack(track) f.write ( item[0] + "\n\n") table = degreeSent.printLabeledArray() # f.write( table.get_string() + "\n\n" ) print "match rate =" , str(matchSum)+"/"+str(len(data)) + "=", matchSum/len(data)
def processDegreeSet(data_set_name, outfileName,failfilename): data = datautils.loadJson(data_set_name) f = open(outfileName, "w") f2 = open(failfilename, "w") total = 0 m = 0 for item in data: # print item sent = item[2] sid = item[0] degreeSent, degreeModel = extractInfo(sent) print sid , degreeModel if len(degreeModel) > 0 : m+=1 f.write( sent +"\n\n" ) f.write( degreeSent.printLabeledArray().get_string() +"\n\n" ) f.write( str(degreeModel) +"\n\n" ) else : f2.write( sent +"\n\n" ) f2.write( degreeSent.printLabeledArray().get_string() +"\n\n" ) total += 1 f2.write( "\n\n match="+ str( m) + " total="+ str( total) + " radio=" + str (float(m)/total) ) print "match=", m, " total=", total, " radio=", float(m)/total
def processDegreeSet(data_set_name, outfileName, failfilename):
    """Run extractInfo over each sentence of the JSON data set.

    Matched sentences (non-empty degreeModel) are written to outfileName
    with their labeled table and extracted model; unmatched ones go to
    failfilename, followed by a match/total summary.

    NOTE(review): this redefines the processDegreeSet declared earlier in
    the file with identical logic; only the later definition survives.
    NOTE(review): f and f2 are never closed, and the final division
    raises ZeroDivisionError when the data set is empty.
    """
    data = datautils.loadJson(data_set_name)
    f = open(outfileName, "w")
    f2 = open(failfilename, "w")
    total = 0
    m = 0
    for item in data:
        # print item
        # item layout: [sentence-id, ?, sentence-text]
        sent = item[2]
        sid = item[0]
        degreeSent, degreeModel = extractInfo(sent)
        print sid, degreeModel
        if len(degreeModel) > 0:
            m += 1
            f.write(sent + "\n\n")
            f.write(degreeSent.printLabeledArray().get_string() + "\n\n")
            f.write(str(degreeModel) + "\n\n")
        else:
            f2.write(sent + "\n\n")
            f2.write(degreeSent.printLabeledArray().get_string() + "\n\n")
        total += 1
    # "radio" appears to be a typo for "ratio" (kept: it is runtime output)
    f2.write("\n\n match=" + str(m) + " total=" + str(total) + " radio=" + str(float(m) / total))
    print "match=", m, " total=", total, " radio=", float(m) / total
def getLabeledSentence(data_set_name, outfileName): labelGrammer = createDegreeGrammar() data = datautils.loadJson(data_set_name) pattern1 = [ "DE_LEVEL", StarRepetition([",", "DE_LEVEL"]), QuestionRepetition(["OR", "DE_LEVEL"]), "DEGREE" ] fst = TokenRegex(pattern1) matchSum = 0 f = open(outfileName, "w") for item in data: # print item words = item[2].split() degreeSent = JobSentence(words) labelGrammer.labelSentence(degreeSent) labeledArray = degreeSent.getLabeledArray() array = [x[0] for x in labeledArray] print item[0], ": ", array match = fst.match(array) print "match=", match if match: matchSum += 1 # printTrack(track) f.write(item[0] + "\n\n") table = degreeSent.printLabeledArray() # f.write( table.get_string() + "\n\n" ) print "match rate =", str(matchSum) + "/" + str( len(data)) + "=", matchSum / len(data)
def preProcess(data_set_name, target_set_name):
    """Append the tokenized form to each item shorter than 200 chars,
    replace slot 1 with the token count, and dump the surviving items
    ordered by that count."""
    max_length = 200
    records = datautils.loadJson(data_set_name)
    kept = []
    for record in records:
        if len(record[1]) >= max_length:
            continue  # skip over-long texts
        record.append(preProcessFun(record[1]))
        record[1] = len(record[2].split())
        kept.append(record)
    kept.sort(key=operator.itemgetter(1))
    datautils.dumpTwo(kept, target_set_name, dumpLam2)
def preProcess(data_set_name, target_set_name):
    """Filter out items whose raw text reaches 200 chars, attach the
    preprocessed token list, overwrite slot 1 with the resulting token
    count, and dump the items sorted by that count.

    NOTE(review): duplicates an identical preProcess defined earlier in
    the file; the later definition is the one that takes effect.
    """
    max_length = 200
    data = datautils.loadJson(data_set_name)
    filtered = []
    for entry in data:
        short_enough = len(entry[1]) < max_length
        if short_enough:
            entry.append(preProcessFun(entry[1]))
            entry[1] = len(entry[2].split())
            filtered.append(entry)
    ordered = sorted(filtered, key=operator.itemgetter(1))
    datautils.dumpTwo(ordered, target_set_name, dumpLam2)
def labelDegreeSet(data_set_name, outfileName): data = datautils.loadJson(data_set_name) f = open(outfileName, "w") total = 0 for item in data: # print item sent = item[2] # sid = item[0] print sent labeledSent = labelSent(sent) # print labeledSent.getCrfFormat() f.write(labeledSent.getCrfFormat()) total += 1
def labelDegreeSet( data_set_name, outfileName ): data = datautils.loadJson(data_set_name) f = open(outfileName, "w") total = 0 for item in data: # print item sent = item[2] # sid = item[0] print sent labeledSent = labelSent( sent ) # print labeledSent.getCrfFormat() f.write(labeledSent.getCrfFormat()) total += 1
def labelDegreeSet(matchers, data_set_name, outfileName,failfilename): for matcher in matchers: matcher.matchNum = 0 data = datautils.loadJson(data_set_name) f = open(outfileName, "w") f2 = open(failfilename, "w") total = 0 m = 0 for item in data: # print item sent = item[2] sid = item[0] matcher = None degreeSent, matcher = labelSentByMatchers(matchers, sent) if matcher is not None: output = matcher.output() found = matcher.found else: output = None found = None print sid ,found, output total += 1 if matcher is not None : m+=1 # print sent.encode("GBK", "ignore") f.write( sent.encode("GBK", "ignore") +"\n\n" ) f.write( degreeSent.printLabeledArray().get_string() +"\n\n" ) f.write( str(found) + " " + str(output) +"\n\n" ) else : f2.write( sent.encode("GBK", "ignore") +"\n\n" ) f2.write( degreeSent.printLabeledArray().get_string() +"\n\n" ) f2.write( "\n\n match="+ str( m) + " total="+ str( total) + " radio=" + str (float(m)/total) +"\n" ) print "match=", m, " total=", total, " radio=", float(m)/total i = 0 for matcher in matchers : i+=1 print "matcher ", i, ":", matcher.matchNum f2.write( "\n matcher " + str( i) + ":" + str( matcher.matchNum ) )
def labelDegreeSet(data_set_name, outfileName): labelGrammer = createDegreeGrammar() data = datautils.loadJson(data_set_name) f = open(outfileName, "w") for item in data: # print item words = item[2].split() degreeSent = JobSentence(words) labelGrammer.labelSentence(degreeSent) print item[0] f.write ( item[0] + "\n\n") table = degreeSent.printSentenct() # print table.get_string() + "\n\n" f.write( table.get_string() + "\n\n" )
def labelDegreeSet(data_set_name, outfileName): labelGrammer = createDegreeGrammar() data = datautils.loadJson(data_set_name) f = open(outfileName, "w") for item in data: # print item words = item[2].split() degreeSent = JobSentence(words) labelGrammer.labelSentence(degreeSent) print item[0] f.write(item[0] + "\n\n") table = degreeSent.printSentenct() # print table.get_string() + "\n\n" f.write(table.get_string() + "\n\n")
def labelExampleSet(data_set_name, outfileName, start, end): data = datautils.loadJson(data_set_name) f = open(outfileName, "w") total = 0 r = 100 for i in range(end - start): # print item item = data[i + start] sent = item[2] # sid = item[0] print sent labeledSent = labelSent(sent) # print labeledSent.getCrfFormat() f.write(labeledSent.getCrfFormat()) total += 1
def labelExampleSet( data_set_name, outfileName, start, end ): data = datautils.loadJson(data_set_name) f = open(outfileName, "w") total = 0 r = 100 for i in range(end-start): # print item item = data[i+start] sent = item[2] # sid = item[0] print sent labeledSent = labelSent( sent ) # print labeledSent.getCrfFormat() f.write(labeledSent.getCrfFormat()) total += 1
def beforeDegree():
    """Print a frequency table of the word immediately preceding
    "degree" in data set degree_1 ("__NO__" when "degree" starts the
    text)."""
    data_set_name = "degree_1"
    data = datautils.loadJson(data_set_name)
    counts = {}
    for record in data:
        tokens = record[1].lower().split()
        pos = findToken("degree", tokens)
        if pos == -1:
            continue  # "degree" absent from this text
        preceding = "__NO__" if pos == 0 else tokens[pos - 1]
        counts[preceding] = counts.get(preceding, 0) + 1
    datautils.printStatDict(counts)
def labelMajorSet(data_set_name, outfileName, start, num): data = datautils.loadJson(data_set_name) f = open(outfileName, "w") total = 0 r = 100 i = 0 while i < num: # print item item = data[i + start] sent = item[2] # sid = item[0] print sent labeledSent = labelSent(sent) # print labeledSent.getCrfFormat() f.write(labeledSent.getCrfFormat()) total += 1
def beforeDegree():
    """Tally which token comes right before "degree" in each lowercased
    text of the degree_1 set and print the distribution ("__NO__" marks
    texts that start with "degree").

    NOTE(review): duplicates the beforeDegree defined earlier in the
    file; the later definition is the one in effect.
    """
    data_set_name = "degree_1"
    data = datautils.loadJson(data_set_name)
    freq = {}
    for item in data:
        words = item[1].lower().split()
        idx = findToken("degree", words)
        if idx != -1:
            if idx == 0:
                prev = "__NO__"
            else:
                prev = words[idx - 1]
            if prev in freq:
                freq[prev] += 1
            else:
                freq[prev] = 1
    datautils.printStatDict(freq)
def labelMajorSet( data_set_name, outfileName, start, num ): data = datautils.loadJson(data_set_name) f = open(outfileName, "w") total = 0 r = 100 i = 0 while i < num: # print item item = data[i+start] sent = item[2] # sid = item[0] print sent labeledSent = labelSent( sent ) # print labeledSent.getCrfFormat() f.write(labeledSent.getCrfFormat()) total += 1
def filterTerms(): wordset = buildTerms() data = datautils.loadJson("term3") worddict = {} for item in data: # print item sent = item[1] sid = item[0] tokens = sent.lower().split() for token in tokens : if not token in wordset: if worddict.has_key(token) : worddict[token] += 1 else : worddict[token] = 1 sorted_x = sorted(worddict.iteritems(), key=operator.itemgetter(1)) f = open("unknowns.txt", "w") for key, value in sorted_x: print key.encode("GBK", "ignore"), value f.write(key.encode("GBK", "ignore") + " " + str( value ) + "\n" )