Example #1
def WriteCluster(excelfile, folder, np=None):
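    """Write one MEAD-style .cluster XML file per week and prompt type.

    Each file holds a <CLUSTER> root with a single <D> node whose DID is
    '<week>_<type>'; weeks with no student responses are skipped. Relies on
    module-level globals (maxWeek, course) and project helpers (fio,
    CourseMirror_Survey); the `np` argument is unused.
    """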
    sheets = range(0, maxWeek)

    for type in ['q1', 'q2', 'q3', 'q4']:
        for sheet in sheets:
            week = sheet + 1
            student_summaryList = CourseMirror_Survey.getStudentResponseList(
                excelfile, course, week, type, withSource=True)
            if len(student_summaryList) == 0: continue

            path = folder + str(week) + '/'
            fio.NewPath(path)

            path = path + type + '/'
            fio.NewPath(path)
            filename = path + type + '.cluster'

            #create an XML file
            root = ET.Element(tag='CLUSTER', attrib={'LANG': "ENG"})
            root.tail = '\n'
            tree = ET.ElementTree(root)

            DID = str(sheet + 1) + '_' + type

            node = ET.Element(tag='D', attrib={'DID': str(DID)})
            node.tail = '\n'
            root.append(node)

            tree.write(filename)
Example #2
def WriteDocsent(excelfile, folder, phrasedir, np=None):
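    """Write MEAD-style .docsent XML files per week and prompt type.

    Phrases are read from '<phrasedir>/<week>/<type>.<method>.key' (skipped
    if missing) and emitted as one <S> node each into
    '<folder><week>/<type>/docsent/<week>_<type>.docsent'.
    """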
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:

            phrasefile = os.path.join(phrasedir, str(week),
                                      type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue

            print phrasefile

            DID = str(week) + '_' + type

            path = folder + str(week) + '/'
            fio.NewPath(path)
            path = path + type + '/'
            fio.NewPath(path)
            path = path + 'docsent/'
            fio.NewPath(path)
            filename = path + DID + '.docsent'

            #create an XML file
            root = ET.Element(tag='DOCSENT',
                              attrib={
                                  'DID': DID,
                                  'LANG': "ENG"
                              })
            root.tail = '\n'
            tree = ET.ElementTree(root)

            phrases = fio.ReadFileUTF8(phrasefile)

            sno_id = 1
            for par, phrase in enumerate(phrases):
                phrase = phrase.rstrip()
                s = [phrase]

                for RSNT, value in enumerate(s):
                    node = ET.Element(tag='S',
                                      attrib={
                                          'PAR': str(par + 1),
                                          'RSNT': str(RSNT + 1),
                                          'SNO': str(sno_id)
                                      })
                    node.text = value
                    node.tail = '\n'
                    root.append(node)
                    sno_id = sno_id + 1

            tree.write(filename)
Example #3
def test_cross_course(train, name='all'):
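    """Predict extraction labels for every lecture of the current course
    with a CRF model named after `train` (typically another course).

    Writes per-lecture, per-prompt wapiti outputs and records the produced
    test-file keys in class_index_dict_file. Note: prediction is still
    attempted when the model file is missing; only a warning is printed.
    """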
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        test = [lec]
        model_file = os.path.join(model_dir, '%s.model' % train)

        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            print "Model is not available"

        for q in ['q1', 'q2']:

            test_filename = os.path.join(feature_cv_dir,
                                         'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            if method == 'combine':
                test_filename_old = test_filename.replace('_combine', '_A1')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:
                combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug: break

    file_util.save_dict2json(dict, class_index_dict_file)
Example #4
def train_leave_one_lecture_out_NP(name='cv'):
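    """Leave-one-lecture-out setup for the NP baseline: no CRF is trained;
    the held-out lecture's features are combined directly into the output
    file for each prompt.
    """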
    feature_dir = '../data/%s/%s/extraction/' % (course, system)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        for q in ['q1', 'q2']:

            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            combine_files(feature_dir, test, output_file, prompts=[q])

        if debug: break

    file_util.save_dict2json(dict, class_index_dict_file)
Example #5
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
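    """Read CRF outputs with color annotations and save, per lecture and
    prompt, the lower-cased extracted phrases ('.key' list) plus their
    colors ('.key.color' JSON). Column indices [0, -1, -4, -3] select the
    token, tag, and two color fields of the CRF output.
    """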
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(
                    crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(
                    tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):

                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
Example #6
def extractPhrase(excelfile, folder, sennadatadir, method):
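    """Extract key phrases from student responses for each week and prompt
    type, using the matching pre-computed SENNA output file, and save them
    to '<folder><week>/<type>.<method>.key'.
    """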
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            #for type in ['POI', 'MP']:
            print excelfile, sheet, type
            student_summaryList = CourseMirror_Survey.getStudentResponseList(
                excelfile, course, week, type, withSource=False)
            if len(student_summaryList) == 0: continue

            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.' + method + '.key'

            sennafile = sennadatadir + "senna." + str(
                week) + "." + type + '.output'
            if not fio.IsExist(sennafile): continue

            phrases = getKeyPhrases(student_summaryList,
                                    sennafile,
                                    method=method,
                                    MalformedFlilter=True)

            fio.SaveList(phrases, filename)
Example #7
def extractPhrasePaireFeature(phrasedir):
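    """Generate similarity features for every ordered pair of extracted
    phrases in each lecture/prompt, with a placeholder score of 0.0, and
    save the feature set to a JSON file.
    """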
    for lec in annotation.Lectures:
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method))

            phrases = fio.LoadList(phrasefile)

            for p1 in phrases:
                for p2 in phrases:
                    featureset.append(
                        (feature_extractor.get_features(p1, p2), 0.0, {
                            'p1': p1,
                            'p2': p2
                        }))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
Example #8
def correlation_analysis(course):
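    """Write a feature/score/prediction matrix for correlation analysis.

    For both oracle annotators, load the stored phrase-pair features, score
    each pair with the pickled per-lecture model, zero out NaN features, and
    append the gold score and prediction to each row of '<outdir>/data.txt'.
    """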
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score', 'predict']
    body = []
    lectures = annotation.Lectures
    name = '_'.join(features)

    for i, lec in enumerate(lectures):

        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        for q in ['q1', 'q2']:

            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []

                    for fname in features:
                        x = fdict[fname]

                        if str(x) == 'nan':
                            x = 0.0

                        row.append(x)

                    predict_score = clf.predict([row])

                    row.append(score)

                    row.append(predict_score[0])

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')

    print out_correlation
    fio.WriteMatrix(out_correlation, body, head)
Example #9
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
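    """Build labeled phrase-pair data from one annotator's phrase groups:
    pairs within the same rank group are scored 1.0, pairs across groups
    0.0. Features and labels are saved as JSON per lecture and prompt.
    """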
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print filename

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            #generate examples: same rank group -> 1.0, different rank -> 0.0
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]
                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()

                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1,
                                                                p2), score, {
                                                                    'p1': p1,
                                                                    'p2': p2
                                                                }))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
Example #10
def PrepareIE256():
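    """Prepare sentence-level inputs for the IE256 course: per week and
    prompt, save the deduplicated responses ('.sentence.key'), a
    response-to-students map ('.sentence.keys.source'), and response counts
    ('.sentence.dict').
    """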
    cid = "IE256"
    maxWeek = 25

    excelfile = "../data/CourseMirror/Reflection.json"
    sennadir = "../../AbstractPhraseSummarization/data/IE256/senna/"

    #fio.NewPath(sennadir)
    #getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)

    outdirs = [  #'../../AbstractPhraseSummarization/data/IE256/ILP_Baseline_Sentence/',
        #'../../AbstractPhraseSummarization/data/IE256/MC/',
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_MC/',
        '../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_Supervised_FeatureWeightingAveragePerceptron/',
    ]

    sheets = range(1, maxWeek + 1)

    for outdir in outdirs:
        for sheet in sheets:
            week = sheet

            for type in ['q1', 'q2', 'q3', 'q4']:
                student_summaryList = getStudentResponseList(
                    excelfile, cid, week, type, True)
                if len(student_summaryList) == 0: continue

                path = os.path.join(outdir, str(week))
                fio.NewPath(path)

                source = {}
                responses = []
                count = defaultdict(int)
                for response, student in student_summaryList:
                    responses.append(response)
                    count[response] += 1

                    if response not in source:
                        source[response] = []
                    source[response].append(student)

                output = os.path.join(path, type + ".sentence.key")
                fio.SaveList(set(responses), output)

                output = os.path.join(path, type + '.sentence.keys.source')
                fio.SaveDict2Json(source, output)

                output = os.path.join(path, type + '.sentence.dict')
                fio.SaveDict(count, output)
Example #11
def train_on_course(traincourse, name='all'):
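    """Train one wapiti CRF model on all usable lectures of `traincourse`
    (lecture ranges are hard-coded per course); training is skipped when the
    model file already exists.
    """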
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (traincourse, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (traincourse, system,
                                                       name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    if traincourse == 'IE256':
        lectures = [x for x in range(14, 26) if x != 22]
    else:
        lectures = [x for x in range(3, 27)]

    dict = defaultdict(int)

    train = [x for x in lectures]

    train_filename = os.path.join(feature_cv_dir, 'train.feature.crf')

    model_file = os.path.join(model_dir, '%s.model' % traincourse)

    print train_filename
    print model_file

    crf = CRF(wapiti_home)
    if not fio.IsExist(model_file):
        #if True:
        combine_files(feature_dir, train, train_filename)
        crf.train(train_filename, pattern_file, model_file)
Example #12
def correlation_analysis_noduplicate():
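    """Variant of correlation_analysis that records only gold scores (no
    model predictions) and skips degenerate pairs where p1 == p2.
    """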
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score']
    body = []
    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        for q in ['q1', 'q2']:

            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, pd in data:
                    if pd['p1'] == pd['p2']:
                        print pd['p1']
                        continue

                    row = []

                    for name in features:
                        x = fdict[name]

                        if str(x) == 'nan':
                            x = 0.0

                        row.append(x)
                    row.append(score)

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')
    fio.WriteMatrix(out_correlation, body, head)
Example #13
def get_phrase_reference_summary_phrase(outputs=None):
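    """Export human reference summaries from the 'Phrase' annotation task.

    For each lecture, prompt (q1 for POI, q2 otherwise), and annotator, save
    the summary phrases ('.ref.<annotator>'), their colors ('.color'), and
    student numbers ('.no'), then print word-count statistics.
    """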
    
    for output in outputs:
        fio.NewPath(output)
        
        counts = []
        for doc, lec, annotator in annotation.generate_all_files(
                annotation.datadir + 'json/',
                '.json',
                anotators=annotation.anotators,
                lectures=annotation.Lectures):
            print doc
            
            task = annotation.Task()
            task.loadjson(doc)
            
            sub_tasks = task.get_tasks()
            
            for sub_task in sub_tasks:
                if sub_task["task_name"] == "Phrase":
                    if sub_task['prompt'] == 0: #POI
                        type = 'q1'
                    else: 
                        type = 'q2'
                    
                    summary_filename = os.path.join(output, str(lec), type+'.ref.' + str(annotation.anotator_dict[annotator])) 
                    #summary_filename = os.path.join(output, str(lec), type+'.ref.summary') 
                    
                    print summary_filename
                    
                    summaries = [row[1] for row in sub_task["summary"][1:]]
                    colors = [row[0].strip()[1] for row in sub_task["summary"][1:]]
                    student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]
                    
                    count = 0
                    for summary in summaries:
                        count += len(NLTKWrapper.wordtokenizer(summary))
                    
                    counts.append(count)
                    fio.SaveList(summaries, summary_filename)
                    
                    color_filename = os.path.join(output, str(lec), '%s.ref.%s.color'%(type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(colors, color_filename)
                    
                    no_filename = os.path.join(output, str(lec), '%s.ref.%s.no'%(type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(student_numbers, no_filename)
        
        print counts
        print numpy.mean(counts)
        print numpy.median(counts)
Example #14
def extractPhraseFromCRF(phrasedir, systemdir):
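    """Read CRF tagger outputs per lecture/prompt, convert tag sequences to
    phrase spans, and save the lower-cased phrases to
    '<prompt>.<method>.key'.
    """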
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            phrases = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            for tokens, tags in crf_reader.read_file_generator(crf_file):
                for phrase in aligner.get_phrase(tokens, tags):
                    phrases.append(phrase.lower())

            fio.SaveList(phrases, filename)
Example #15
def extractPhraseFromAnnotationIntersect(phrasedir, annotators):
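    """For each lecture, align both annotators' phrase annotations and save
    only the intersection of extracted phrases per prompt.
    """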
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            print filename

            extracted_phrases = []
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()
            extracted_phrases = aligner.get_intersect()

            fio.SaveList(extracted_phrases, filename)
Example #16
    fio.WriteMatrix(output, allbody, allhead)


def check_stopword():
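    """Print every vocabulary word with count >= 5 that also appears in the
    stopword list, together with its count.
    """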
    from CourseMirror_Survey import stopwords

    vocab = fio.LoadDictJson(global_params.vocab)

    for word, count in vocab.items():
        if count < 5: continue

        if word in stopwords:
            print word, '\t', count


if __name__ == '__main__':
    course = global_params.g_cid

    for system, method in [
        ('QPS_combine', 'crf'),
    ]:
        phrasedir = "../data/" + course + "/" + system + "/phrase/"

        #         extractPhrasePaireFeature(phrasedir)

        model_dir = "../data/" + course + "/simlearning/svm"
        fio.NewPath(model_dir)

        train_leave_one_lecture_out_svm(model_dir)
        predict_leave_one_lecture_out(model_dir, phrasedir, modelname='svm')
Example #17
                    if response not in source:
                        source[response] = []
                    source[response].append(student)

                output = os.path.join(path, type + ".sentence.key")
                fio.SaveList(set(responses), output)

                output = os.path.join(path, type + '.sentence.keys.source')
                fio.SaveDict2Json(source, output)

                output = os.path.join(path, type + '.sentence.dict')
                fio.SaveDict(count, output)
    #write human summary


if __name__ == '__main__':
    #PrepareIE256()
    #exit(0)

    cid = sys.argv[1]
    maxWeek = int(sys.argv[2])

    #     cid = 'CS0445'
    #     maxWeek = 28

    excelfile = "../data/CourseMirror/reflections.json"
    annotation_dir = "../data/Annotation/" + cid + '/'
    sennadir = "../data/" + cid + "/senna/"
    fio.NewPath(sennadir)
    getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)
Example #18
    fio.WriteMatrix(datadir + "summary.txt", body, head)
                        
if __name__ == '__main__':
    course = sys.argv[1]
    maxWeek = int(sys.argv[2])
    system = sys.argv[3]
    method = sys.argv[4]
    similarity = sys.argv[5] 
    K = int(sys.argv[6])
    
    excelfile = "../data/CourseMIRROR/reflections.json"
              
    clusterdir = "../data/"+course+"/"+system+"/phrase/"
    fio.NewPath(clusterdir)
      
    datadir = "../data/"+course+"/"+system+"/PhraseMead/"
    GetLexRankScore(datadir, method, clusterdir)
          
    for ratio in [K]:
        for lex in ['lexrankmax']:
            datadir = "../data/"+course+"/"+system+ '/ClusterARank/'   
            #fio.DeleteFolder(datadir)
            ShallowSummary(excelfile, datadir, clusterdir, K=5, method = method, similarity=similarity, ratio=ratio, lex=lex)

                #PrintClusterRankSummary(datadir)
    
    print "done"
    
Example #19
def GetLexRankScore(datadir, np, outputdir):
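    """Collect LexRank scores for the phrases of each week/prompt type.

    Phrases come from the MEAD .docsent file and scores from the matching
    '.LexRank.sentfeature' file ("0" when absent). Saves a last-wins
    phrase-to-score dict ('.lexrank.dict') and a max-per-phrase variant
    ('.lexrankmax.dict').
    """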
    sheets = range(0, maxWeek)
    
    for type in ['q1', 'q2', 'q3', 'q4']:
        for sheet in sheets:
            week = sheet + 1
            
            DID = str(week) + '_' + type
            
            phrases = []
            scores = []
    
            #read Docsent
            path = datadir + str(week)+ '/'
            path = path + type + '/'
            path = path + 'docsent/'
            filename = path + DID + '.docsent'
            #print filename
            if not fio.IsExist(filename): continue
            
            tree = ET.parse(filename)
            root = tree.getroot()
            
            for child in root:
                phrases.append(child.text)
            
            #read feature
            path = datadir + str(week)+ '/'
            path = path + type + '/'
            path = path + 'feature/'
            filename = path + type + '.LexRank.sentfeature'
            
            if fio.IsExist(filename):
                tree = ET.parse(filename)
                root = tree.getroot()
                
                for child in root:
                    feature = child[0]
                    #print feature.tag, feature.attrib, feature.attrib['V']
                    #print child.tag, child.attrib
                    scores.append(feature.attrib['V'])
            else:
                for phrase in phrases:
                    scores.append("0")
                
            #write
            assert(len(phrases) == len(scores))
            
            dict = {}
            for phrase, score in zip(phrases, scores):
                dict[phrase.lower()] = score
            
            output = outputdir + str(week)+ '/' + str(type) + "." + np + ".lexrank.dict"
            fio.NewPath(outputdir + str(week)+ '/')
            fio.SaveDict(dict, output, SortbyValueflag=True)
            
            dict = {}
            for phrase, score in zip(phrases, scores):
                if phrase.lower() in dict:
                    #scores are strings, so compare them numerically
                    dict[phrase.lower()] = max(score, dict[phrase.lower()], key=float)
                else:
                    dict[phrase.lower()] = score
            
            output = outputdir + str(week)+ '/' + str(type) + "." + np + ".lexrankmax.dict"
            fio.SaveDict(dict, output, SortbyValueflag=True)
Example #20
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None, similarity=None, ratio=None, lex='lexrank'):
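    """Build a shallow summary per week/prompt: cluster candidate phrases
    with k-medoids (computed on the fly if missing), rank clusters by their
    LexRank scores, and keep each cluster's top phrase until K phrases are
    selected.
    """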
    #K caps the number of phrases per summary (word counts are tracked but unused)
    sheets = range(0,maxWeek)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        for type in ['q1', 'q2', 'q3', 'q4']:
            
            path = folder + str(week)+ '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary'%ratio
            
            #produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue
            
            print excelfile, sheet, type
            
            cluster_output = clusterdir + str(week) +'/' + type + ".cluster.kmedoids." + str(ratio) + "." +similarity + '.' + method
            print cluster_output
            
            weightfile = clusterdir + str(week)+ '/' + type + '.' + method + '.' + similarity
            print weightfile
            
            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output): continue
            body = fio.ReadMatrix(cluster_output, False)
            
            NPCandidates = fio.ReadFile(phrasefile)
            
            lexfile = clusterdir + str(week)+ '/' + str(type) + "." + method + "."+lex+".dict"
            lexdict = fio.LoadDict(lexfile, 'float')
            
            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]
            
            #assert(NPCandidates == NPs)
            if NPCandidates != NPs: 
                print NPCandidates
                print NPs
            
            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])
            
            Summary = []
            
            #sort the clusters according to the number of responses
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)
            
            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary: continue
                
                word_count = len(phrase.split())
                total_word = total_word + word_count
                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)
                    
            fio.SaveList(Summary, filename)
Example #21
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
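    """Extract annotated phrases per lecture/prompt and record their rank
    groups as a pseudo-clustering. When `summarydir` is given, also pick,
    per rank group, the phrase with the highest ROUGE score against the
    human summary; a per-lecture JSON cache avoids recomputing ROUGE.
    """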
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotator,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        #Add a cache to make it faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec),
                                            '%s.summary' % prompt)

            body = []

            if summarydir:
                summaries = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)
            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(
                        rank_summary, rank_phrases, Cache)
                    print max_summary

                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)

            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

            with open(cachefile, 'w') as outfile:
                json.dump(Cache, outfile, indent=2)
Example #22
if __name__ == '__main__':
    #     course = global_params.g_cid
    #     output = "../data/"+course + '/length.txt'
    #     compare_length(annotation.anotators[:1], output)
    #     exit(-1)

    course = sys.argv[1]
    maxWeek = int(sys.argv[2])
    system = sys.argv[3]
    method = sys.argv[4]

    sennadir = "../data/" + course + "/senna/"
    excelfile = "../data/CourseMIRROR/reflections.json"

    systemdir = "../data/" + course + "/" + system + "/"
    fio.NewPath(systemdir)

    phrasedir = "../data/" + course + "/" + system + "/phrase/"
    fio.NewPath(phrasedir)

    summarydir = "../data/" + course + "/" + system + "/ClusterARank/"
    if summarydir:
        fio.NewPath(summarydir)

#     output = "../data/"+course + '/statistics.txt'
#     extractStatistics(annotation.anotators[:1], output)
#     exit(-1)

    if method == 'syntax':
        extractPhrase(excelfile, phrasedir, sennadir, method=method)
    elif method == 'annotator1':
Example #23
def extractPhraseFromSyntax(extractiondir, annotators):
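    """Write CRF feature files in a plain column layout: one token per row,
    followed by one column per annotator color, the aligned tag, and a
    SENNA PSG syntax tag.
    """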
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #         if lec != 11: continue

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            extracted_phrases = []
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                tags = d['tags'][0]

                colors = d['colors']

                n_tokens = []
                n_tags = []

                for token, tag in zip(tokens, tags):
                    if len(token) == 0: continue

                    n_tokens.append(token)
                    n_tags.append(tag)

                if len(n_tokens) == 0: continue

                tokens = n_tokens
                tags = n_tags

                body = []

                words = tokens
                N = len(tokens)

                #first column: the word token
                for word in words:
                    row = []
                    row.append(word)
                    body.append(row)

                #next columns: the annotation colors
                for color in colors:
                    for i, tag in enumerate(tags):
                        body[i].append(str(color[i]))

                #next column: the aligned tag

                for i, tag in enumerate(tags):
                    body[i].append(tag)

                #last column: the SENNA PSG (syntax) tag
                psg_tags = getSennaPSGtags(tokens)
                for i, tag in enumerate(psg_tags):
                    body[i].append(tag)

                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()

        if debug:
            break
Example #24
    #exit(-1)

    debug = False

    course = sys.argv[1]
    maxWeek = int(sys.argv[2])
    system = sys.argv[3]
    method = sys.argv[4]
    empty = sys.argv[5]

    excelfile = "../data/CourseMIRROR/reflections.json"

    extractVocab(annotation.anotators, '../data/%s/vocab.json' % course)

    extractiondir = "../data/" + course + "/" + system + "/extraction/"
    fio.NewPath(extractiondir)

    class_index_dict_file = '../data/%s/class_dict.json' % course

    if method == 'NP':
        extractPhraseFromSyntax(extractiondir, annotation.anotators)
        train_leave_one_lecture_out_NP('all')

    elif method == 'annotator1':
        extractPhraseFeatureFromAnnotation(extractiondir, annotation.anotators,
                                           0, empty)
    elif method == 'annotator2':
        extractPhraseFeatureFromAnnotation(extractiondir, annotation.anotators,
                                           1, empty)
    elif method == 'union':
        extractPhraseFeatureFromUnion(extractiondir, annotation.anotators,
Example #25
def extractPhraseFeatureFromCombine(extractiondir, annotators, empty='N'):
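    """Build CRF feature files combining both annotators: when their tag
    sequences differ, each response is added once per tagging. With
    empty == 'Y', sequences tagged all-'O' are dropped before feature
    extraction.
    """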
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #print lec

        #if lec != 17: continue

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = extractiondir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:

            #if prompt != 'q2': continue

            filename = path + prompt + '.feature.crf'
            print filename

            fout = codecs.open(filename, "w", "utf-8")

            extracted_phrases = []
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            crf_feature_extractor = CRF_Extractor()

            #add sentences to the extractor for global feature extraction
            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]
                colors = d['colors']

                if d['tags'][0] == d['tags'][1]:
                    combinetags = [d['tags'][0]]
                else:
                    combinetags = [d['tags'][0], d['tags'][1]]

                for tags in combinetags:
                    n_tokens = []
                    n_tags = []

                    for token, tag in zip(tokens, tags):
                        if len(token) == 0: continue

                        n_tokens.append(token)
                        n_tags.append(tag)

                    if len(n_tokens) == 0: continue

                    #keep `tokens` intact: a second tagging in combinetags
                    #must be zipped against the original token list
                    crf_feature_extractor.add_sentence((n_tokens, n_tags, colors))

            for tokens, tags, colors in crf_feature_extractor.sentences:
                if empty == 'Y':
                    flag = True
                    for tag in tags:
                        if tag != 'O': flag = False
                    if flag: continue

                body = crf_feature_extractor.extract_crf_features(
                    tokens, tags, prompt, colors)

                for row in body:
                    fout.write(' '.join(row))
                    fout.write('\n')
                fout.write('\n')

            fout.close()
Example #26
def train_leave_one_lecture_out(name='cv'):
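    """Standard leave-one-lecture-out CRF pipeline: train a wapiti model on
    all other lectures (reused if already present), then predict both
    prompts of the held-out lecture, copying previously generated feature
    files when `empty` or `method` calls for a derived configuration.
    """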
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        train_filename = os.path.join(feature_cv_dir,
                                      'train_%d.feature.crf' % i)

        model_file = os.path.join(model_dir, '%d.model' % i)

        print train_filename
        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            #if True:
            combine_files(feature_dir, train, train_filename)
            crf.train(train_filename, pattern_file, model_file)

        for q in ['q1', 'q2']:

            test_filename = os.path.join(feature_cv_dir,
                                         'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            if empty == 'Y':
                test_filename_old = test_filename.replace('_Y', '_N')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:

                if method == 'combine':
                    test_filename_old = test_filename.replace(
                        '_combine', '_A1')
                    cmd = 'cp %s %s' % (test_filename_old, test_filename)
                    os.system(cmd)
                else:
                    combine_files(feature_dir,
                                  test,
                                  test_filename,
                                  prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug: break

    file_util.save_dict2json(dict, class_index_dict_file)