Example #1
def plot_reference_summary_no_distribution():
    import collections

    cids = ['Engineer', 'IE256', 'IE256_2016', 'CS0445']

    # per-course counts over the supporter numbers loaded from %s_supporters.txt
    C = {}
    M = 0
    for cid in cids:
        support_file = '../data/%s_supporters.txt' % cid
        supports = fio.LoadDictJson(support_file)

        M = max(M, max(supports))
        C[cid] = collections.Counter(supports)

    # drop the zero-supporter bucket
    for cid in cids:
        del C[cid][0]

    # cumulative distribution of supporter counts per course
    A = {}
    for i in range(1, M+1):
        for cid in cids:
            if cid not in A:
                A[cid] = collections.defaultdict(float)

            r = C[cid][i]*1.0/sum(C[cid].values()) if i in C[cid] else 0
            A[cid][i] += r + A[cid][i-1]

    for i in range(1, M+1):
        for cid in cids:
            print A[cid][i], '\t',
        print
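All of these examples go through the project's fio helper module, whose implementation is not shown on this page. A minimal stub of the assumed behavior of fio.LoadDictJson, inferred from how the callers use it (an assumption, not the project's actual code):

import json

def LoadDictJson(jsonfile):
    # assumed: decode one JSON file and return the resulting object
    with open(jsonfile, 'r') as fin:
        return json.load(fin)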
Example #2
def combine_files(lectures, features=None, prompts=['q1', 'q2']):
    # 'course' and 'sim_exe' are globals defined elsewhere in the source module
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    X = []
    Y = []

    if features is None:
        sim_extractor = Similarity()
        features = sorted(sim_extractor.features.keys())

    for i, lec in enumerate(lectures):
        for q in prompts:

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []

                    for name in features:
                        x = fdict[name]
                        if str(x) == 'nan':
                            x = 0.0
                        row.append(x)

                    X.append(row)
                    Y.append(score)

    return X, Y
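The returned (X, Y) pair is a feature matrix plus similarity scores, presumably consumed by a regression learner. A hedged sketch using scikit-learn; SVR is an assumption here, not the model this project actually trains:

from sklearn.svm import SVR

# assumes the module globals ('course', 'sim_exe') are already configured
X, Y = combine_files(annotation.Lectures)
clf = SVR()
clf.fit(X, Y)
print clf.predict([X[0]])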
Example #3
def PrintClusterRankSummary(datadir):
    sheets = range(0, maxWeek)  # 'maxWeek' and 'course' are globals defined elsewhere in the module

    lectures = fio.LoadDictJson('../data/CourseMIRROR/lectures.json')
    
    head = ['week', 'date', 'Point of Interest', "Muddiest Point"]
    body = []
    
    for i, sheet in enumerate(sheets):        
        row = []
        week = i + 1
        
        row.append(week)
        row.append(getDate(lectures, course, week))
        
        for qtype in ['q1', 'q2', 'q3', 'q4']:
            path = datadir + str(i+1) + '/'
            summaryfile = path + qtype + '.summary'
            if not fio.IsExist(summaryfile): continue

            summaries = [line.strip() for line in fio.ReadFile(summaryfile)]

            sourcefile = path + qtype + '.summary.source'
            sources = [line.split(',') for line in fio.ReadFile(sourcefile)]

            combinedSummary = []
            for j, (summary, source) in enumerate(zip(summaries, sources)):
                summary = summary.replace('"', '\'')
                combinedSummary.append(str(j+1) + ") " + summary + " [" + str(len(source)) + "]")

            # chr(10) is '\n'; quote the cell so embedded newlines survive
            row.append('"' + chr(10).join(combinedSummary) + '"')
        
        body.append(row)
    fio.WriteMatrix(datadir + "summary.txt", body, head)
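fio.WriteMatrix is used throughout these examples to persist a table with a header row. A stub of its assumed behavior (tab-separated output is a guess, not confirmed by this page):

def WriteMatrix(path, body, head):
    # assumed: one header line, then one tab-separated line per row
    with open(path, 'w') as fout:
        if head:
            fout.write('\t'.join(map(str, head)) + '\n')
        for row in body:
            fout.write('\t'.join(map(str, row)) + '\n')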
Example #4
def correlation_analysis(course):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score', 'predict']
    body = []
    lectures = annotation.Lectures
    name = '_'.join(features)

    for i, lec in enumerate(lectures):

        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        for q in ['q1', 'q2']:

            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))  # note: assigned but never used in this function

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []

                    for fname in features:
                        x = fdict[fname]

                        if str(x) == 'nan':
                            x = 0.0

                        row.append(x)

                    predict_score = clf.predict([row])

                    row.append(score)

                    row.append(predict_score[0])

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')

    print out_correlation
    fio.WriteMatrix(out_correlation, body, head)
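The 'score' and 'predict' columns written above are presumably compared with a correlation test. A hedged sketch with scipy; it assumes WriteMatrix emitted a tab-separated file with a header line, and the course name in the path is illustrative:

from scipy.stats import pearsonr

scores, predicts = [], []
with open('../data/IE256/simlearning/data.txt') as fin:  # illustrative course name
    header = fin.readline().strip().split('\t')
    si, pi = header.index('score'), header.index('predict')
    for line in fin:
        parts = line.strip().split('\t')
        scores.append(float(parts[si]))
        predicts.append(float(parts[pi]))

r, p = pearsonr(scores, predicts)
print 'pearson r =', r, 'p =', p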
Example #5
def check_stopword():
    from CourseMirror_Survey import stopwords

    vocab = fio.LoadDictJson(global_params.vocab)

    for word, count in vocab.items():
        if count < 5: continue  # skip rare words

        if word in stopwords:
            print word, '\t', count
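The shape of vocab is not documented in this snippet; judging by the count comparison, it is assumed to map each word to its corpus frequency:

# assumed shape of the vocab JSON (illustrative values only):
# {"the": 1250, "loop": 42, "recursion": 17}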
Example #6
def correlation_analysis_noduplicate():
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()

    features = sorted(sim_extractor.features.keys())
    head = features + ['score']
    body = []
    lectures = annotation.Lectures

    for i, lec in enumerate(lectures):
        for q in ['q1', 'q2']:

        outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))  # note: assigned but never used in this function

            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'

                filename = os.path.join(path, q + sim_exe)

                data = fio.LoadDictJson(filename)

                for fdict, score, pd in data:
                    # skip pairs where both phrases are identical
                    if pd['p1'] == pd['p2']:
                        print pd['p1']
                        continue

                    row = []

                    for name in features:
                        x = fdict[name]

                        if str(x) == 'nan':
                            x = 0.0

                        row.append(x)
                    row.append(score)

                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')
    fio.WriteMatrix(out_correlation, body, head)
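The per-record structure unpacked above as (fdict, score, pd) is not documented on this page; the following shape is an inference from the field accesses, not a confirmed schema:

# [ [ {"Cosine": 0.41, "BLEU": 0.12, ...},           # fdict: feature name -> value
#     0.8,                                            # score: annotated similarity
#     {"p1": "first phrase", "p2": "second phrase"}   # pd: the phrase pair
#   ], ... ]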
Example #7
    def __init__(self, key_prefix, sum_prefix, N):
        '''
        N is the number of annotators
        '''

        self.key_prefix = key_prefix
        self.sum_prefix = sum_prefix
        self.N = N

        #load phrase color map
        phrasefile = key_prefix + phrase_exe
        phrases = fio.LoadList(phrasefile)

        colorfile = key_prefix + color_exe
        color_map = fio.LoadDictJson(colorfile)

        phrase_color_map = self.combine_phrase_color(phrases, color_map)

        #get phrase summary color map
        sumfile = sum_prefix + sum_exe
        summaries = fio.LoadList(sumfile)

        self.summary_color = self.get_summary_color(summaries, phrase_color_map)

        #get summary count
        sumcountfile = sum_prefix + sum_count_exe
        self.summary_no = [int(x) for x in fio.LoadList(sumcountfile)]

        assert(len(self.summary_color) == len(self.summary_no))

        #load human_summary color map
        self.ref_color = []

        for i in range(N):
            d = {}
            ref_sumcolor_file = '%s%s.%d.color' % (sum_prefix, ref_exe, i)
            ref_sumno_file = '%s%s.%d.no' % (sum_prefix, ref_exe, i)

            for color, no in zip(fio.LoadList(ref_sumcolor_file), fio.LoadList(ref_sumno_file)):
                d[int(color)] = int(no)

            self.ref_color.append(d)
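The enclosing class of this __init__ is not shown; the name SummaryColorMap below is purely hypothetical, as are the paths, but the arguments match the signature above:

# hypothetical class name and paths, for illustration only
viewer = SummaryColorMap(key_prefix='../data/IE256/key/', sum_prefix='../data/IE256/sum/', N=2)
print viewer.summary_no  # summary counts loaded in the constructor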
Example #8
    def __init__(self, prefix=""):
        self.features = {
            'optimumComparerLSATasa': self.LSA,
            'LexicalOverlap': self.LexicalOverlap,
            'optimumComparerWNLin': self.LIN,
            'BLEU': self.BLEU,
            'ROUGE': self.ROUGE,
            'Cosine': self.Cosine,
            'WordEmbedding': self.WordEmbedding,
            #'WMD': self.WMD,
        }

        self.prefix = prefix

        self.Cache = {}
        self.cachefile = prefix + 'cache.json'
        print self.cachefile
        if fio.IsExist(self.cachefile):
            with open(self.cachefile, 'r') as fin:
                self.Cache = json.load(fin)

        if self.prefix != '':
            self.matrixdict = {}
            for sim in [
                    'optimumComparerLSATasa', 'LexicalOverlap',
                    'optimumComparerWNLin', 'BLEU'
            ]:
                self.matrixdict[sim] = {}

                filename = self.prefix + sim

                phrases, matrix = fio.ReadMatrix(filename, hasHead=True)

                index = {}
                for i, p in enumerate(phrases):
                    index[p] = i

                self.matrixdict[sim]['index'] = index
                self.matrixdict[sim]['matrix'] = matrix

        self.word2vec = fio.LoadDictJson(global_params.word2vec_model)
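The other examples use this class only through its features dictionary (feature name -> bound scoring method); a minimal usage sketch grounded in that pattern:

sim = Similarity()
for name in sorted(sim.features.keys()):
    print name  # e.g. 'BLEU', 'Cosine', 'ROUGE', ...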
Example #9
def loadmodel(modelbin, vocabjson, output):
    vocab = fio.LoadDictJson(vocabjson)

    word_vecs = load_bin_vec(modelbin, vocab)

    fio.SaveDict2Json(word_vecs, output)
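A hedged invocation sketch: the .bin path below is the standard pre-trained word2vec archive name and is an assumption, while the two JSON paths reuse the global_params fields referenced in examples #5 and #8:

# the .bin filename is an assumed input, not taken from this project
loadmodel('GoogleNews-vectors-negative300.bin', global_params.vocab, global_params.word2vec_model)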
Example #10
    def run(self, cid, summarylastlecture=False):
        max_lecture = 30

        #         max_lecture = self.get_max_lecture_num(cid)
        #         print "max_lecture", max_lecture

        #         #get reflections
        #         reflections = self.get_reflections(cid)
        jsonfile = '../data/CourseMIRROR/reflections.json'
        #         with open(jsonfile, 'w') as outfile:
        #             json.dump(reflections, outfile, encoding='utf-8', indent=2)
        #

        reflections = fio.LoadDictJson(jsonfile)

        #get lectures
        lectures = self.get_lectures(cid)
        jsonfile = '../data/CourseMIRROR/lectures.json'
        with open(jsonfile, 'w') as outfile:
            json.dump(lectures, outfile, encoding='utf-8', indent=2)

        self.N = len(reflections['results'])
        print "total number of reflections:", self.N

        if self.N == self.old_N:  #no new reflections, so nothing to summarize
            return

        self.old_N = self.N

        #run senna
        #         os.system('python CourseMirror_Survey.py ' + str(cid) + ' ' +  str(max_lecture))
        #
        #         cmd = 'cmd /C "runSennaCourseMirror.bat '+str(cid)+ ' ' + str(max_lecture) + '"'
        #         os.system(cmd)
        #
        #
        #         cmd = 'python QPS_extraction.py %s %d %s %s %s'%(cid, max_lecture, self.system, str(self.method), 'N')
        #         os.system(cmd)

        #         cmd = 'python QPS_prepare.py ' + str(cid) + ' ' +  str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method)
        #         os.system(cmd)
        #
        #       #. get PhraseMead input (CourseMirror_MeadPhrase.py)
        #         cmd = 'python CourseMirror_MeadPhrase.py ' + str(cid) + ' ' +  str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method)
        #         print cmd
        #         os.system(cmd)
        # #
        #         olddir = os.path.dirname(os.path.realpath(__file__))
        #
        #         #     . get PhraseMead output
        #         meaddir = global_params.meaddir
        #         cmd = './get_mead_summary_phrase_qps.sh ' + str(cid) + ' ' +  str(max_lecture) + ' ' + str(self.system)
        #         os.chdir(meaddir)
        #         retcode = subprocess.call([cmd], shell=True)
        #         print retcode
        #         subprocess.call("exit 1", shell=True)
        #         os.chdir(olddir)
        #
        #         #     . get LSA results (CourseMirrorphrase2phraseSimilarity.java)
        #         #cmd = 'cmd /C "runLSA.bat '+str(cid)+ ' ' + str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method) + '"'
        #         cmd = 'cmd /C "runLSA_All.bat '+str(cid)+ ' ' + str(max_lecture) + ' ' + str(self.system) + ' ' + str(self.method) + '"'
        #         os.system(cmd)

        #get community detection results

        # get ClusterARank (CourseMirror_phraseClusteringbasedShallowSummaryKmedoid-New-Malformed-LexRank.py)
        cmd = "python CourseMirror_ClusterARank.py %s %d %s %s %s" % (
            cid, max_lecture, self.system, self.method, self.similarity)
        print cmd
        os.system(cmd)

        cmd = "python get_summary.py %s %s" % (cid, self.system)
        print cmd
        os.system(cmd)

        cmd = "python get_Rouge.py %s %d %s %s" % (
            cid, max_lecture, self.system, self.method + '_' + self.similarity)
        print cmd
        os.system(cmd)

        cmd = "python eval_student_number.py %s %d %s %s %s" % (
            cid, max_lecture, self.system, self.method, self.similarity)
        print cmd
        os.system(cmd)
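A hypothetical driver for run(); the enclosing class and its constructor are not shown, so the class name and the fields initialized in it are assumptions inferred from the attributes run() reads:

# hypothetical wrapper class name; run() expects self.system, self.method,
# self.similarity, and self.old_N to have been set beforehand
pipeline = SummarizationPipeline()
pipeline.run('CS0445')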