示例#1
0
def getLabeledSentence(data_set_name, outfileName):    
    labelGrammer =  createDegreeGrammar()        
    data = datautils.loadJson(data_set_name)
    
    pattern1 = ["DE_LEVEL", StarRepetition([",","DE_LEVEL"]), QuestionRepetition(["OR","DE_LEVEL"]),"DEGREE" ]
    fst = TokenRegex(pattern1) 
    
    matchSum = 0 
    f = open(outfileName, "w")
    for item in data:
    #    print item
        words = item[2].split()
        degreeSent = JobSentence(words)
        labelGrammer.labelSentence(degreeSent)
        labeledArray = degreeSent.getLabeledArray()
        array = [x[0] for x in labeledArray ]
        print item[0], ":  " ,array
        match = fst.match(array) 
        print "match=", match
        if match :
            matchSum += 1 
      #  printTrack(track)
        f.write (  item[0] + "\n\n")         
        table = degreeSent.printLabeledArray()  
    #    f.write( table.get_string()  + "\n\n" )      
    print "match rate =" , str(matchSum)+"/"+str(len(data)) + "=", matchSum/len(data)
示例#2
0
def processDegreeSet(data_set_name, outfileName,failfilename):
     
    data = datautils.loadJson(data_set_name)
   
    f = open(outfileName, "w")
    f2 = open(failfilename, "w")
    total = 0
    m = 0
    for item in data:
    #    print item
        sent = item[2]    
        sid = item[0]         
       
        degreeSent, degreeModel = extractInfo(sent) 
        
        print sid , degreeModel
        if len(degreeModel) > 0 :
            m+=1
            f.write( sent +"\n\n" )
            f.write( degreeSent.printLabeledArray().get_string() +"\n\n" )
            f.write( str(degreeModel)   +"\n\n" )
        else :
            f2.write( sent +"\n\n" )
            f2.write( degreeSent.printLabeledArray().get_string() +"\n\n" )
            
            
        total += 1
        
            
    f2.write( "\n\n match="+ str( m) + "  total="+ str( total) + "  radio=" + str (float(m)/total) )
             
    print "match=", m, "  total=", total, "  radio=", float(m)/total
示例#3
0
def processDegreeSet(data_set_name, outfileName, failfilename):

    data = datautils.loadJson(data_set_name)

    f = open(outfileName, "w")
    f2 = open(failfilename, "w")
    total = 0
    m = 0
    for item in data:
        #    print item
        sent = item[2]
        sid = item[0]

        degreeSent, degreeModel = extractInfo(sent)

        print sid, degreeModel
        if len(degreeModel) > 0:
            m += 1
            f.write(sent + "\n\n")
            f.write(degreeSent.printLabeledArray().get_string() + "\n\n")
            f.write(str(degreeModel) + "\n\n")
        else:
            f2.write(sent + "\n\n")
            f2.write(degreeSent.printLabeledArray().get_string() + "\n\n")

        total += 1

    f2.write("\n\n match=" + str(m) + "  total=" + str(total) + "  radio=" +
             str(float(m) / total))

    print "match=", m, "  total=", total, "  radio=", float(m) / total
示例#4
0
def getLabeledSentence(data_set_name, outfileName):
    labelGrammer = createDegreeGrammar()
    data = datautils.loadJson(data_set_name)

    pattern1 = [
        "DE_LEVEL",
        StarRepetition([",", "DE_LEVEL"]),
        QuestionRepetition(["OR", "DE_LEVEL"]), "DEGREE"
    ]
    fst = TokenRegex(pattern1)

    matchSum = 0
    f = open(outfileName, "w")
    for item in data:
        #    print item
        words = item[2].split()
        degreeSent = JobSentence(words)
        labelGrammer.labelSentence(degreeSent)
        labeledArray = degreeSent.getLabeledArray()
        array = [x[0] for x in labeledArray]
        print item[0], ":  ", array
        match = fst.match(array)
        print "match=", match
        if match:
            matchSum += 1
    #  printTrack(track)
        f.write(item[0] + "\n\n")
        table = degreeSent.printLabeledArray()
    #    f.write( table.get_string()  + "\n\n" )
    print "match rate =", str(matchSum) + "/" + str(
        len(data)) + "=", matchSum / len(data)
示例#5
0
def preProcess(data_set_name, target_set_name):
    """Filter, preprocess and sort a data set, then dump it.

    Items whose ``item[1]`` has a length below 200 are kept. For each kept
    item the result of ``preProcessFun(item[1])`` is appended, and
    ``item[1]`` is then overwritten with the whitespace-token count of
    ``item[2]``. The kept items are sorted ascending by that count and
    passed to ``datautils.dumpTwo`` together with ``dumpLam2``.

    Args:
        data_set_name: Name handed to ``datautils.loadJson``.
        target_set_name: Name handed to ``datautils.dumpTwo``.
    """
    length_limit = 200
    kept = []
    for record in datautils.loadJson(data_set_name):
        if len(record[1]) >= length_limit:
            continue
        # presumably records have two fields on load, so record[2] is the
        # value appended here -- verify against the data set.
        record.append(preProcessFun(record[1]))
        record[1] = len(record[2].split())
        kept.append(record)
    kept.sort(key=operator.itemgetter(1))
    datautils.dumpTwo(kept, target_set_name, dumpLam2)
示例#6
0
def preProcess(data_set_name, target_set_name):
    """Build a length-sorted, preprocessed copy of a data set.

    Only items with ``len(item[1]) < 200`` survive. Each survivor gets
    ``preProcessFun(item[1])`` appended, after which ``item[1]`` is replaced
    by the number of whitespace-separated tokens in ``item[2]``. Survivors
    are ordered by that number and written via ``datautils.dumpTwo`` with
    ``dumpLam2``.

    Args:
        data_set_name: Name handed to ``datautils.loadJson``.
        target_set_name: Name handed to ``datautils.dumpTwo``.
    """
    size_cap = 200
    source_rows = datautils.loadJson(data_set_name)

    selected = []
    for row in source_rows:
        too_long = not (len(row[1]) < size_cap)
        if too_long:
            continue
        processed = preProcessFun(row[1])
        row.append(processed)
        # NOTE(review): row[2] is the value just appended only if rows are
        # loaded with two fields -- confirm.
        token_count = len(row[2].split())
        row[1] = token_count
        selected.append(row)

    by_token_count = operator.itemgetter(1)
    selected.sort(key=by_token_count)
    datautils.dumpTwo(selected, target_set_name, dumpLam2)
示例#7
0
def labelDegreeSet(data_set_name, outfileName):

    data = datautils.loadJson(data_set_name)

    f = open(outfileName, "w")
    total = 0
    for item in data:
        #  print item
        sent = item[2]
        #   sid = item[0]

        print sent
        labeledSent = labelSent(sent)
        #  print labeledSent.getCrfFormat()
        f.write(labeledSent.getCrfFormat())
        total += 1
示例#8
0
def labelDegreeSet( data_set_name, outfileName ):
    
    data = datautils.loadJson(data_set_name)
   
    f = open(outfileName, "w")     
    total = 0    
    for item in data:
      #  print item
        sent = item[2]    
     #   sid = item[0]        
        
        print sent 
        labeledSent = labelSent( sent )
      #  print labeledSent.getCrfFormat()
        f.write(labeledSent.getCrfFormat())
        total += 1
示例#9
0
def labelDegreeSet(matchers, data_set_name, outfileName,failfilename):
   
    for matcher in matchers:       
            matcher.matchNum = 0     
     
    data = datautils.loadJson(data_set_name)
   
    f = open(outfileName, "w")
    f2 = open(failfilename, "w")
    total = 0
    m = 0
    for item in data:
    #    print item
        sent = item[2]    
        sid = item[0]         
        matcher = None
        degreeSent, matcher = labelSentByMatchers(matchers, sent) 
     
        if matcher is not None:
            output = matcher.output()
            found = matcher.found
        else:
            output = None
            found = None
        
        print sid ,found, output 
        total += 1
        if matcher is not None :
            m+=1
       #     print sent.encode("GBK", "ignore")

            f.write( sent.encode("GBK", "ignore") +"\n\n" )
            f.write( degreeSent.printLabeledArray().get_string() +"\n\n" )
            f.write( str(found) + "   " + str(output) +"\n\n" )
        else :
            f2.write( sent.encode("GBK", "ignore") +"\n\n" )
            f2.write( degreeSent.printLabeledArray().get_string() +"\n\n" )
             
    f2.write( "\n\n match="+ str( m) + "  total="+ str( total) + "  radio=" + str (float(m)/total) +"\n" )
             
    print "match=", m, "  total=", total, "  radio=", float(m)/total
    
    i = 0
    for matcher in matchers :
        i+=1
        print "matcher ", i, ":", matcher.matchNum
        f2.write( "\n matcher " + str( i) + ":" + str( matcher.matchNum ) )
示例#10
0
def labelDegreeSet(data_set_name, outfileName):
    labelGrammer =  createDegreeGrammar()        
    data = datautils.loadJson(data_set_name)
     
    f = open(outfileName, "w")
    for item in data:
    #    print item
        words = item[2].split()
        degreeSent = JobSentence(words)
        labelGrammer.labelSentence(degreeSent)
       
        print item[0]
        f.write (  item[0] + "\n\n") 
        
        table = degreeSent.printSentenct()  
   #     print table.get_string() + "\n\n"
        f.write( table.get_string()  + "\n\n" )        
示例#11
0
def labelDegreeSet(data_set_name, outfileName):
    labelGrammer = createDegreeGrammar()
    data = datautils.loadJson(data_set_name)

    f = open(outfileName, "w")
    for item in data:
        #    print item
        words = item[2].split()
        degreeSent = JobSentence(words)
        labelGrammer.labelSentence(degreeSent)

        print item[0]
        f.write(item[0] + "\n\n")

        table = degreeSent.printSentenct()
        #     print table.get_string() + "\n\n"
        f.write(table.get_string() + "\n\n")
示例#12
0
def labelExampleSet(data_set_name, outfileName, start, end):

    data = datautils.loadJson(data_set_name)

    f = open(outfileName, "w")
    total = 0
    r = 100
    for i in range(end - start):
        #  print item
        item = data[i + start]
        sent = item[2]
        #   sid = item[0]

        print sent
        labeledSent = labelSent(sent)
        #  print labeledSent.getCrfFormat()
        f.write(labeledSent.getCrfFormat())
        total += 1
示例#13
0
def labelExampleSet( data_set_name, outfileName, start, end ):
    
    data = datautils.loadJson(data_set_name)
   
    f = open(outfileName, "w")     
    total = 0    
    r = 100
    for i in range(end-start):
      #  print item
        item = data[i+start]
        sent = item[2]    
     #   sid = item[0]        
        
        print sent 
        labeledSent = labelSent( sent )
      #  print labeledSent.getCrfFormat()
        f.write(labeledSent.getCrfFormat())
        total += 1
示例#14
0
def beforeDegree():
    """Count which word directly precedes "degree" in each sentence.

    Loads the ``"degree_1"`` data set, lower-cases and whitespace-splits
    ``item[1]``, and locates the token ``"degree"`` with ``findToken``. The
    preceding word (or ``"__NO__"`` when "degree" is the first token) is
    tallied, and the tally is passed to ``datautils.printStatDict``.
    Sentences where ``findToken`` returns -1 are skipped.
    """
    data_set_name = "degree_1"
    data = datautils.loadJson(data_set_name)
    dict1 = {}
    for item in data:
        words = item[1].lower().split()
        i = findToken("degree", words)
        if i == -1:
            # The original fell through and counted the stale ``term`` from
            # the previous sentence (or hit NameError on the first one).
            continue
        term = "__NO__" if i == 0 else words[i - 1]
        dict1[term] = dict1.get(term, 0) + 1
    datautils.printStatDict(dict1)
示例#15
0
def labelMajorSet(data_set_name, outfileName, start, num):

    data = datautils.loadJson(data_set_name)

    f = open(outfileName, "w")
    total = 0
    r = 100
    i = 0
    while i < num:
        #  print item
        item = data[i + start]
        sent = item[2]
        #   sid = item[0]

        print sent
        labeledSent = labelSent(sent)
        #  print labeledSent.getCrfFormat()
        f.write(labeledSent.getCrfFormat())
        total += 1
示例#16
0
def beforeDegree():
    """Tally the token that appears immediately before "degree".

    Every item of the ``"degree_1"`` data set has ``item[1]`` lower-cased
    and split on whitespace. ``findToken("degree", words)`` gives the token
    index; the word before it is counted, with ``"__NO__"`` standing in when
    "degree" is at index 0. Items for which ``findToken`` returns -1 do not
    contribute. The counts are handed to ``datautils.printStatDict``.
    """
    data_set_name = "degree_1"
    data = datautils.loadJson(data_set_name)
    dict1 = {}
    for item in data:
        words = item[1].lower().split()
        i = findToken("degree", words)
        if i != -1:
            if i == 0:
                term = "__NO__"
            else:
                term = words[i - 1]
            # Counting lives inside the guard: previously it ran for every
            # item, re-counting the prior item's term (or raising NameError
            # if the very first item had no "degree" token).
            dict1[term] = dict1.get(term, 0) + 1
    datautils.printStatDict(dict1)
示例#17
0
def labelMajorSet( data_set_name, outfileName, start, num ):
    
    data = datautils.loadJson(data_set_name)
   
    f = open(outfileName, "w")     
    total = 0    
    r = 100
    i = 0 
    while i < num:
      #  print item
        item = data[i+start]
        sent = item[2]    
     #   sid = item[0]        
        
        print sent 
        labeledSent = labelSent( sent )
      #  print labeledSent.getCrfFormat()
        f.write(labeledSent.getCrfFormat())
        total += 1
示例#18
0
def filterTerms():    
    wordset =  buildTerms()   
    data = datautils.loadJson("term3")
    worddict = {}
    for item in data:
     #   print item
        sent = item[1]    
        sid = item[0]         
        tokens = sent.lower().split()
        for token in tokens :
            if not token in wordset:
                if worddict.has_key(token) :
                    worddict[token] += 1
                else :
                    worddict[token] = 1
                   
    sorted_x = sorted(worddict.iteritems(), key=operator.itemgetter(1))
    
    f = open("unknowns.txt", "w")
    for key, value in sorted_x:
         print key.encode("GBK", "ignore"), value
         f.write(key.encode("GBK", "ignore") + "  " + str( value ) + "\n" )