Exemplo n.º 1
0
def loadDevDocs():
    #read dev_truth docs from Mongo
    dev_docs_en = cpr.loadDocuments({
        'dev_truth': {
            '$exists': 1
        },
        'origin': {
            '$ne': 'linkedin'
        },
        'language': 'en'
    })
    dev_docs_en += cpr.loadDocuments(
        {
            'dev_truth': {
                '$exists': 1
            },
            'origin': 'linkedin',
            'language': 'en'
        }, cpr.LinkedInProfile)
    dev_docs_nl = cpr.loadDocuments({
        'dev_truth': {
            '$exists': 1
        },
        'language': 'nl'
    })

    print "EN:", [doc._id for doc in dev_docs_en]
    print "NL:", [doc._id for doc in dev_docs_nl]

    return dev_docs_en, dev_docs_nl
Exemplo n.º 2
0
def loadDevDocs():
    #read dev_truth docs from Mongo
    dev_docs_en = cpr.loadDocuments(
        {'dev_truth': {'$exists': 1}, 'origin': {'$ne': 'linkedin'},
         'language': 'en'})
    dev_docs_en += cpr.loadDocuments(
        {'dev_truth': {'$exists': 1}, 'origin': 'linkedin',
         'language': 'en'}, cpr.LinkedInProfile)
    dev_docs_nl = cpr.loadDocuments(
        {'dev_truth': {'$exists': 1}, 'language': 'nl'})

    print "EN:", [doc._id for doc in dev_docs_en]
    print "NL:", [doc._id for doc in dev_docs_nl]

    return dev_docs_en, dev_docs_nl
Exemplo n.º 3
0
def judgeWebsites():
    # Load website Documents from Mongo
    ws_docs = cpr.loadDocuments({'origin': 'website'})
    # Make statements for all ws_docs
    for doc in ws_docs:
        if hasattr(doc, 'dev_truth'):
            print "Not making statements for %s; already has dev_truth" % doc.title
        else:
            doc.dev_truth = {}
            doc.makeStatements()
    # Select two docs with most statements
    two_most = heapq.nlargest(2, ws_docs, key=countStatements)
    # Remove dev_truth for non-selected docs
    for doc in ws_docs:
        if doc not in two_most:
            print "Delete dev_truth for %s" % doc.title
            del doc.dev_truth
            doc.toMongo()
    # Judge selected docs
    for sel_doc in two_most:
        if countStatements(sel_doc, key='extracted') > 0:
            rm_count = interactiveGroundTruth(sel_doc)
            if rm_count < 1:
                print(
                    "No statements removed: default behavior DELETE.\n"
                    "Press Enter or Space to KEEP dev_truth,"
                    " or another key to delete ...")
                delete = getch()
                if delete in (" ", "\r"):
                    print "**Dev_truth KEPT**\n"
                else:
                    del sel_doc.dev_truth
                    print "**Dev_truth DELETED**\n"
        else:
            print "Max statements is 0; Check documents!"
Exemplo n.º 4
0
def judgeCourseDescriptions():
    # Which courses were taken by many students?
    c_dict, top = countCourseAttendance()
    (top_nl, top_en) = [], []
    for c_id, count in top:
        doc = cpr.loadDocuments({'_id': c_id})[0]
        if doc.language == 'en':
            top_en.append(doc)
        elif doc.language == 'nl':
            top_nl.append(doc)
        if min(len(top_en), len(top_nl)) >= 3:
            break  # 3 top docs per language is enough
    # Make statements and judge selected docs
    for sel_doc in (top_nl[0], top_nl[0]):
        print "\n\n", sel_doc._id
        if hasattr(sel_doc, 'dev_truth'):
            print "Overwrite dev_truth for %s? YES or NO" % sel_doc._id
            overwrite = raw_input('Overwrite?-> ')
            if overwrite == "YES": pass
            else:
                print sel_doc._id, "skipped"
                continue

        sel_doc.dev_truth = {}
        sel_doc.makeStatements("dev_truth")  # should work, but verify
        interactiveGroundTruth(sel_doc)
Exemplo n.º 5
0
def judgeWebsites():
    # Load website Documents from Mongo
    ws_docs = cpr.loadDocuments({'origin': 'website'})
    # Make statements for all ws_docs
    for doc in ws_docs:
        if hasattr(doc, 'dev_truth'):
            print "Not making statements for %s; already has dev_truth" % doc.title
        else:
            doc.dev_truth = {}
            doc.makeStatements()
    # Select two docs with most statements
    two_most = heapq.nlargest(2, ws_docs, key=countStatements)
    # Remove dev_truth for non-selected docs
    for doc in ws_docs:
        if doc not in two_most:
            print "Delete dev_truth for %s" % doc.title
            del doc.dev_truth
            doc.toMongo()
    # Judge selected docs
    for sel_doc in two_most:
        if countStatements(sel_doc, key='extracted') > 0:
            rm_count = interactiveGroundTruth(sel_doc)
            if rm_count < 1:
                print("No statements removed: default behavior DELETE.\n"
                      "Press Enter or Space to KEEP dev_truth,"
                      " or another key to delete ...")
                delete = getch()
                if delete in (" ", "\r"):
                    print "**Dev_truth KEPT**\n"
                else:
                    del sel_doc.dev_truth
                    print "**Dev_truth DELETED**\n"
        else: print "Max statements is 0; Check documents!"
Exemplo n.º 6
0
def judgeCourseDescriptions():
    # Which courses were taken by many students?
    c_dict, top = countCourseAttendance()
    (top_nl, top_en) = [], []
    for c_id, count in top:
        doc = cpr.loadDocuments({'_id': c_id})[0]
        if doc.language == 'en':
            top_en.append(doc)
        elif doc.language == 'nl':
            top_nl.append(doc)
        if min(len(top_en), len(top_nl)) >= 3:
            break # 3 top docs per language is enough
    # Make statements and judge selected docs
    for sel_doc in (top_nl[0], top_nl[0]):
        print "\n\n", sel_doc._id
        if hasattr(sel_doc, 'dev_truth'):
            print "Overwrite dev_truth for %s? YES or NO" % sel_doc._id
            overwrite = raw_input('Overwrite?-> ')
            if overwrite == "YES": pass
            else:
                print sel_doc._id, "skipped"
                continue
        
        sel_doc.dev_truth = {}
        sel_doc.makeStatements("dev_truth") # should work, but verify
        interactiveGroundTruth(sel_doc)
Exemplo n.º 7
0
def judgeLinkedIn():
    # Load LinkedIn Documents from Mongo
    li_docs = cpr.loadDocuments({'origin': 'linkedin'}, cpr.LinkedInProfile)
    # Select the two docs with most extracted statements and judge them
    two_most = heapq.nlargest(2, li_docs, key=countStatements)
    for sel_doc in two_most:
        if countStatements(sel_doc, key='extracted') > 0:
            interactiveGroundTruth(sel_doc)
        else: print "Max statements is 0; Check documents!"
    # Remove 'dev_truth' from the non-judged documents (prompt to be sure)
    for doc in li_docs:
        if doc not in two_most:
            print "Delete dev_truth for %s? YES or NO" % doc.title
            delete = raw_input('Delete?-> ')
            if delete == "YES":
                del doc.dev_truth
                doc.toMongo()
            else: print doc.title, "skipped"
Exemplo n.º 8
0
def judgeLinkedIn():
    # Load LinkedIn Documents from Mongo
    li_docs = cpr.loadDocuments({'origin': 'linkedin'}, cpr.LinkedInProfile)
    # Select the two docs with most extracted statements and judge them
    two_most = heapq.nlargest(2, li_docs, key=countStatements)
    for sel_doc in two_most:
        if countStatements(sel_doc, key='extracted') > 0:
            interactiveGroundTruth(sel_doc)
        else:
            print "Max statements is 0; Check documents!"
    # Remove 'dev_truth' from the non-judged documents (prompt to be sure)
    for doc in li_docs:
        if doc not in two_most:
            print "Delete dev_truth for %s? YES or NO" % doc.title
            delete = raw_input('Delete?-> ')
            if delete == "YES":
                del doc.dev_truth
                doc.toMongo()
            else:
                print doc.title, "skipped"
Exemplo n.º 9
0
def judgeShareworksPortfolio():
    """Make dev_truth statements for Shareworks documents and judge a sample.

    Loads the documents with doctype 'posts', 'report' and 'slides', makes
    statements for them, selects one post (closest to the 15th percentile of
    statement counts), one report and one slides document (both around the
    median), runs ``interactiveGroundTruth`` on each selected document that
    has extracted statements, and finally deletes ``dev_truth`` from every
    non-selected document and stores it with ``toMongo()``.
    """
    # Load Shareworks Documents from Mongo
    posts = cpr.loadDocuments({'doctype': 'posts'})
    reports = cpr.loadDocuments({'doctype': 'report'})
    slides = cpr.loadDocuments({'doctype': 'slides'})
    # Make statements for all sw_docs
    #
    # The loop below runs at most two passes:
    #   pass 1 (overwrite=False): docs that already have a dev_truth are
    #     skipped and counted. Once the count exceeds 10, the pass is aborted,
    #     overwrite is switched on and the counter is set to -1.
    #   pass 2 (overwrite=True): the negative counter is replaced by -67 so
    #     the exit test after the for-loop (< -10) succeeds; no doc is skipped
    #     any more, so every doc gets a fresh dev_truth and new statements.
    # If the threshold is never exceeded, the loop ends after pass 1.
    has_dev_truth_count = 0
    overwrite = False
    while True:  # this loop is a strange hack and needs refactoring
        print "\nMaking statements for sw_docs (overwrite=%s)" % overwrite
        if has_dev_truth_count < 0:
            # Negative counter marks the overwrite pass; -67 is just a value
            # below the -10 exit threshold checked after the for-loop.
            has_dev_truth_count = -67
        for doc in (posts + reports + slides):
            # threshold for overwriting dev_truths
            if has_dev_truth_count > 10:
                print "\nAssuming docs are not judged; overwrite everything"
                overwrite = True
                has_dev_truth_count = -1
                break
            if hasattr(doc, 'dev_truth') and not overwrite:
                has_dev_truth_count += 1
                print "!!%s already has a dev_truth; skipping" % doc._id
                continue  # don't overwrite; might be judged already
            doc.dev_truth = {}
            doc.makeStatements()
        # Exit after a normal pass, or after the overwrite pass has completed
        if not overwrite or has_dev_truth_count < -10:
            print "\nDone making statements!"
            break
    # Select the post closest to 15th percentile # of extracted statements
    post_counts = map(countStatements, posts)
    # Reciprocal of the percentile, so count * perc_15 is 1 at the percentile.
    # NOTE(review): a percentile of 0 makes this a division by zero -- confirm
    # that statement counts cannot be 0 here.
    perc_15 = 1 / numpy.percentile(post_counts, 15.0)
    sel_post = min(posts, key=lambda d: abs(countStatements(d) * perc_15 - 1))
    print "\nPost %s selected (%i statements)" % (sel_post._id,
                                                  countStatements(sel_post))
    # Select the report closest to median # of extracted statements
    report_counts = map(countStatements, reports)
    # Same normalisation as for the posts, using the 50th percentile
    perc_50 = 1 / numpy.percentile(report_counts, 50.0)
    sel_report = min(reports,
                     key=lambda d: abs(countStatements(d) * perc_50 - 1))
    print "\nReport %s selected (%i statements)" % (
        sel_report._id, countStatements(sel_report))
    # Select the slides closest to median # of extracted statements
    # (use a sort here, because there aren't too many slides docs)
    # The index relies on `/` being integer division (Python 2, as the print
    # statements in this function indicate).
    sel_slides = sorted(slides, key=countStatements)[len(slides) / 2]
    print "\nSlides %s selected (%i statements)" % (
        sel_slides._id, countStatements(sel_slides))

    # Judge the selected documents
    for sel_doc in (sel_post, sel_report, sel_slides):
        if countStatements(sel_doc, key='extracted') > 0:
            interactiveGroundTruth(sel_doc)
        else:
            print "Can't judge a doc with 0 statements; Check documents!"

    # Remove 'dev_truth' from the non-judged documents
    # NOTE(review): this also deletes the dev_truth of docs that were skipped
    # above as "might be judged already" -- confirm that is intended.
    for doc in (posts + reports + slides):
        if doc not in (sel_post, sel_report, sel_slides):
            #print "Delete dev_truth for %s" % doc._id
            del doc.dev_truth
            doc.toMongo()
        else:
            print doc._id, "dev_truth kept!"
    print "\nAll dev_truths deleted for non-judged documents"
Exemplo n.º 10
0
def judgeShareworksPortfolio():
    """Make dev_truth statements for Shareworks documents and judge a sample.

    Loads the documents with doctype 'posts', 'report' and 'slides', makes
    statements for them, selects one post (closest to the 15th percentile of
    statement counts), one report and one slides document (both around the
    median), runs ``interactiveGroundTruth`` on each selected document that
    has extracted statements, and finally deletes ``dev_truth`` from every
    non-selected document and stores it with ``toMongo()``.
    """
    # Load Shareworks Documents from Mongo
    posts = cpr.loadDocuments({'doctype': 'posts'})
    reports = cpr.loadDocuments({'doctype': 'report'})
    slides = cpr.loadDocuments({'doctype': 'slides'})
    # Make statements for all sw_docs
    #
    # The loop below runs at most two passes:
    #   pass 1 (overwrite=False): docs that already have a dev_truth are
    #     skipped and counted. Once the count exceeds 10, the pass is aborted,
    #     overwrite is switched on and the counter is set to -1.
    #   pass 2 (overwrite=True): the negative counter is replaced by -67 so
    #     the exit test after the for-loop (< -10) succeeds; no doc is skipped
    #     any more, so every doc gets a fresh dev_truth and new statements.
    # If the threshold is never exceeded, the loop ends after pass 1.
    has_dev_truth_count = 0
    overwrite = False
    while True: # this loop is a strange hack and needs refactoring
        print "\nMaking statements for sw_docs (overwrite=%s)" % overwrite
        if has_dev_truth_count < 0:
            # Negative counter marks the overwrite pass; -67 is just a value
            # below the -10 exit threshold checked after the for-loop.
            has_dev_truth_count = -67
        for doc in (posts + reports + slides):
            # threshold for overwriting dev_truths
            if has_dev_truth_count > 10:
                print "\nAssuming docs are not judged; overwrite everything"
                overwrite = True
                has_dev_truth_count = -1
                break                
            if hasattr(doc, 'dev_truth') and not overwrite:
                has_dev_truth_count += 1
                print "!!%s already has a dev_truth; skipping" % doc._id
                continue # don't overwrite; might be judged already
            doc.dev_truth = {}
            doc.makeStatements()
        # Exit after a normal pass, or after the overwrite pass has completed
        if not overwrite or has_dev_truth_count < -10:
            print "\nDone making statements!"
            break
    # Select the post closest to 15th percentile # of extracted statements
    post_counts = map(countStatements, posts)
    # Reciprocal of the percentile, so count * perc_15 is 1 at the percentile.
    # NOTE(review): a percentile of 0 makes this a division by zero -- confirm
    # that statement counts cannot be 0 here.
    perc_15 = 1/numpy.percentile(post_counts, 15.0)
    sel_post = min(posts,
	       key=lambda d: abs(countStatements(d)*perc_15-1))
    print "\nPost %s selected (%i statements)" % (sel_post._id,
                                                  countStatements(sel_post))
    # Select the report closest to median # of extracted statements
    report_counts = map(countStatements, reports)
    # Same normalisation as for the posts, using the 50th percentile
    perc_50 = 1/numpy.percentile(report_counts, 50.0)
    sel_report = min(reports,
	       key=lambda d: abs(countStatements(d)*perc_50-1))
    print "\nReport %s selected (%i statements)" % (sel_report._id,
                                                  countStatements(sel_report))
    # Select the slides closest to median # of extracted statements
    # (use a sort here, because there aren't too many slides docs)
    # The index relies on `/` being integer division (Python 2, as the print
    # statements in this function indicate).
    sel_slides = sorted(slides, key=countStatements)[len(slides)/2]
    print "\nSlides %s selected (%i statements)" % (sel_slides._id,
                                                  countStatements(sel_slides))

    # Judge the selected documents
    for sel_doc in (sel_post, sel_report, sel_slides):
        if countStatements(sel_doc, key='extracted') > 0:
            interactiveGroundTruth(sel_doc)
        else: print "Can't judge a doc with 0 statements; Check documents!"

    # Remove 'dev_truth' from the non-judged documents
    # NOTE(review): this also deletes the dev_truth of docs that were skipped
    # above as "might be judged already" -- confirm that is intended.
    for doc in (posts + reports + slides):
        if doc not in (sel_post, sel_report, sel_slides):
            #print "Delete dev_truth for %s" % doc._id
            del doc.dev_truth
            doc.toMongo()
        else: print doc._id, "dev_truth kept!"
    print "\nAll dev_truths deleted for non-judged documents"