Example #1
def extractVocab(annotators, output):
    vocab = defaultdict(int)
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        for prompt in ['q1', 'q2']:
            phrase_annotation0 = task0.get_phrase_annotation(prompt)
            phrase_annotation1 = task1.get_phrase_annotation(prompt)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()

            #count tokens from the aligned responses into the vocabulary
            for d in aligner.responses:
                tokens = [token.lower() for token in d['response']]

                for token in tokens:
                    vocab[token] += 1
    fio.SaveDict2Json(vocab, output)
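
Every example on this page funnels its result through fio.SaveDict2Json. The fio module is project-specific and not reproduced in this listing; as a rough sketch, assuming the helper pair are thin wrappers over the standard json module (the names match the calls in these examples, the bodies are an assumption):

import json

def SaveDict2Json(data, output):
    #assumed implementation: dump any JSON-serializable value
    #(dict, defaultdict, list, ...) to a file
    with open(output, 'w') as fout:
        json.dump(data, fout, indent=2)

def LoadDictJson(input_file):
    #assumed inverse, as used by Example #7 below
    with open(input_file) as fin:
        return json.load(fin)

Note that the defaultdict(int) built above is a dict subclass, so json.dump serializes it like a plain dict.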
Example #2
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            #columns: token (0), predicted tag (-1), two color columns (-4, -3)
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(
                    crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(
                    tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):

                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
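
get_phrase_with_colors itself is not shown in this listing. If the CRF tags follow the usual BIO convention, the token-to-phrase step reduces to standard chunking; the hypothetical helper below sketches that core idea (it ignores the color columns and is not the actual AlignPhraseAnnotation code):

def phrases_from_bio(tokens, tags):
    #collect contiguous B-/I- spans into phrases (plain BIO chunking)
    phrases, current = [], []
    for token, tag in zip(tokens, tags):
        if tag.startswith('B'):
            if current:
                phrases.append(' '.join(current))
            current = [token]
        elif tag.startswith('I') and current:
            current.append(token)
        else:
            if current:
                phrases.append(' '.join(current))
            current = []
    if current:
        phrases.append(' '.join(current))
    return phrases

For example, phrases_from_bio(['a', 'good', 'example'], ['O', 'B', 'I']) returns ['good example'].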
Example #3
def extractPhrasePaireFeature(phrasedir):
    for lec in annotation.Lectures:
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print(filename)

            featureset = []

            feature_extractor = Similarity(prefix)

            phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method))

            phrases = fio.LoadList(phrasefile)

            #score every ordered pair of phrases; 0.0 is a placeholder label
            for p1 in phrases:
                for p2 in phrases:
                    featureset.append(
                        (feature_extractor.get_features(p1, p2), 0.0, {
                            'p1': p1,
                            'p2': p2
                        }))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
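
One detail to remember when these featuresets are read back: JSON has no tuple type, so each (features, score, metadata) tuple is deserialized as a three-element list. Unpacking works the same either way; a small consumer sketch, assuming fio.LoadDictJson is the inverse of fio.SaveDict2Json:

featureset = fio.LoadDictJson(filename)

#the saved tuples come back as lists, but unpacking is unchanged
X = [features for features, score, meta in featureset]
y = [score for features, score, meta in featureset]
pairs = [(meta['p1'], meta['p2']) for features, score, meta in featureset]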
Example #4
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):
        print(doc)

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print(filename)

            featureset = []

            feature_extractor = Similarity(prefix)

            phrase_annotation = task.get_phrase_annotation(prompt)

            #pairs from the same rank are positive (1.0), all others negative (0.0)
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0

                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]
                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()

                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()

                            featureset.append(
                                (feature_extractor.get_features(p1, p2),
                                 score,
                                 {'p1': p1, 'p2': p2}))

            fio.SaveDict2Json(featureset, filename)

            feature_extractor.save()
Example #5
def PrepareIE256():
    cid = "IE256"
    maxWeek = 25

    excelfile = "../data/CourseMirror/Reflection.json"
    sennadir = "../../AbstractPhraseSummarization/data/IE256/senna/"

    #fio.NewPath(sennadir)
    #getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)

    outdirs = [  #'../../AbstractPhraseSummarization/data/IE256/ILP_Baseline_Sentence/',
        #'../../AbstractPhraseSummarization/data/IE256/MC/',
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_MC/',
        '../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_Supervised_FeatureWeightingAveragePerceptron/',
    ]

    sheets = range(1, maxWeek + 1)

    for outdir in outdirs:
        for sheet in sheets:
            week = sheet

            for type in ['q1', 'q2', 'q3', 'q4']:
                student_summaryList = getStudentResponseList(
                    excelfile, cid, week, type, True)
                if len(student_summaryList) == 0: continue

                path = os.path.join(outdir, str(week))
                fio.NewPath(path)

                source = {}
                responses = []
                count = defaultdict(int)
                for response, student in student_summaryList:
                    responses.append(response)
                    count[response] += 1

                    if response not in source:
                        source[response] = []
                    source[response].append(student)

                output = os.path.join(path, type + ".sentence.key")
                fio.SaveList(set(responses), output)

                output = os.path.join(path, type + '.sentence.keys.source')
                fio.SaveDict2Json(source, output)

                output = os.path.join(path, type + '.sentence.dict')
                fio.SaveDict(count, output)
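
One caveat in the block above: set(responses) iterates in arbitrary order, so the saved .sentence.key file is not byte-identical across runs. If reproducible output matters, sorting before saving is a cheap fix:

fio.SaveList(sorted(set(responses)), output)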
Example #6
def get_phrase_reference_summary_phrase_no(outputs=None):
    Numbers = []

    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotation.anotators[:1],
            lectures=annotation.Lectures):
        print(doc)

        task = annotation.Task()
        task.loadjson(doc)

        sub_tasks = task.get_tasks()

        for sub_task in sub_tasks:
            if sub_task["task_name"] == "Phrase":
                if sub_task['prompt'] == 0:  #POI
                    type = 'q1'
                else:
                    type = 'q2'

                #column 2 of each summary row holds the supporter count
                student_numbers = [
                    row[2].strip() for row in sub_task["summary"][1:]
                ]
                Numbers += [int(x) for x in student_numbers]

    fio.SaveDict2Json(Numbers,
                      '../data/%s_supporters.txt' % global_params.g_cid)
Example #7
def loadmodel(modelbin, vocabjson, output):
    vocab = fio.LoadDictJson(vocabjson)

    #keep only the embeddings for words that appear in the vocabulary
    word_vecs = load_bin_vec(modelbin, vocab)

    fio.SaveDict2Json(word_vecs, output)
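
load_bin_vec is the usual reader for word2vec's binary format (the helper popularized by Kim's CNN sentence-classification code); the sketch below follows that convention, though the project's exact version may differ. Vectors are converted to plain lists so that SaveDict2Json can serialize them:

import numpy as np

def load_bin_vec(fname, vocab):
    #read a word2vec .bin file, keeping only words present in vocab
    word_vecs = {}
    with open(fname, 'rb') as f:
        vocab_size, layer1_size = map(int, f.readline().split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in range(vocab_size):
            #the word is everything up to the first space
            chars = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    break
                if ch != b'\n':
                    chars.append(ch)
            word = b''.join(chars).decode('utf-8', 'ignore')
            vec = np.frombuffer(f.read(binary_len), dtype='float32')
            if word in vocab:
                word_vecs[word] = vec.tolist()  #lists are JSON-serializable
    return word_vecs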
Example #8
def getRouge(datadir, maxWeek, output):
    print(datadir)

    sheets = range(0, maxWeek)

    body = []

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:
            summary_file = dir + type + "." + 'summary'
            print(summary_file)

            if not fio.IsExist(summary_file):
                print(summary_file)
                continue

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print(cachefile)
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            #read the TA's summaries
            refs = []
            for i in range(2):
                reffile = os.path.join(datadir, str(week),
                                       type + '.ref.%d' % i)
                if not fio.IsExist(reffile):
                    print(reffile)
                    continue

                lines = fio.ReadFile(reffile)
                ref = [line.strip() for line in lines]
                refs.append(ref)

            if len(refs) == 0: continue

            #flatten all available references (also safe when a ref file is missing)
            lstref = [line for ref in refs for line in ref]

            lines = fio.ReadFile(summary_file)
            TmpSum = [line.strip() for line in lines]

            cacheKey = OracleExperiment.getKey(lstref, TmpSum)
            if cacheKey in Cache:
                scores = Cache[cacheKey]
                print("Hit")
            else:
                print("Miss")
                print(summary_file)
                scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                Cache[cacheKey] = scores

            row = [week] + scores

            body.append(row)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception as e:
                #fio.SaveDict(Cache, cachefile + '.dict')
                print(e)

    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)
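
The Cache / cacheKey logic above is a simple JSON-backed memo for the expensive ROUGE call: the key is derived from the references plus the candidate summary, and the dict is rewritten after each lookup. The same pattern in isolation, with a hypothetical make_key standing in for OracleExperiment.getKey:

import hashlib
import json
import os

def make_key(refs, summary):
    #hypothetical stand-in for OracleExperiment.getKey:
    #any stable digest of the inputs works as a cache key
    payload = json.dumps([refs, summary], sort_keys=True)
    return hashlib.md5(payload.encode('utf-8')).hexdigest()

def cached_rouge(cachefile, refs, summary, compute):
    cache = {}
    if os.path.exists(cachefile):
        with open(cachefile) as fin:
            cache = json.load(fin)

    key = make_key(refs, summary)
    if key not in cache:
        cache[key] = compute(refs, summary)  #the expensive ROUGE run
        with open(cachefile, 'w') as fout:
            json.dump(cache, fout)
    return cache[key]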
Example #9
def getRouge(datadir, maxWeek, output):
    sheets = range(0, maxWeek)

    body = []
    allbody = []

    #Krange = range(1, 25)
    Krange = [gK]

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:

            maxS = 0
            maxK = -1
            maxScore = []

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print(cachefile)
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            allrow = [week]

            #Krange = [np.random.randint(1, 25)]

            for K in Krange:

                summary_file = dir + type + '.%d.summary' % K

                print(summary_file)

                if not fio.IsExist(summary_file):
                    print(summary_file)
                    continue

                #read the TA's summaries
                refs = []
                for i in range(2):
                    reffile = os.path.join(datadir, str(week),
                                           type + '.ref.%d' % i)
                    if not fio.IsExist(reffile):
                        print(reffile)
                        continue

                    lines = fio.ReadFile(reffile)
                    ref = [line.strip() for line in lines]
                    refs.append(ref)

                if len(refs) == 0: continue

                #flatten all available references (also safe when a ref file is missing)
                lstref = [line for ref in refs for line in ref]

                lines = fio.ReadFile(summary_file)
                TmpSum = [line.strip() for line in lines]

                cacheKey = OracleExperiment.getKey(lstref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print("Hit")
                else:
                    print("Miss")
                    print(summary_file)
                    scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                    Cache[cacheKey] = scores

                s = float(scores[RIndex])

                allrow.append(s)

                if s >= maxS:
                    maxS = s
                    maxScore = scores
                    maxK = K

            if maxK == -1: continue

            row = [week] + maxScore + [maxK]

            body.append(row)

            allrow.append(maxK)

            allbody.append(allrow)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception:
                #fio.SaveDict(Cache, cachefile + '.dict')
                pass

    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)

    fio.WriteMatrix(output + '.all', allbody, ['week'] + Krange)
Example #10
def save(self):
    fio.SaveDict2Json(self.Cache, self.cachefile)
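
Example #10 is the write half of that same cache pattern. A matching load/lookup side for such a class might look like the sketch below; only Cache and cachefile are attested above, the rest of the class is an assumption:

class RougeCache(object):
    #hypothetical container mirroring the save() method above
    def __init__(self, cachefile):
        self.cachefile = cachefile
        self.Cache = fio.LoadDictJson(cachefile) if fio.IsExist(cachefile) else {}

    def get(self, key):
        return self.Cache.get(key)

    def put(self, key, scores):
        self.Cache[key] = scores

    def save(self):
        fio.SaveDict2Json(self.Cache, self.cachefile)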