예제 #1
0
def test_main():
    # extract REF data
    ref_inst = parse(field = 0)
    ref_title = parse(field = 5)
    ref_venue = parse(field = 6)
    ref_year = parse(field = 17)

    ref_descriptions = parse(field = 30)
    ref_topics = []
    for descriptor in ref_descriptions:
        topic = re.search("<([0-9]+)>", descriptor)
        if topic is not None:
            ref_topics.append(int(topic.group(1)))
        else:
            ref_topics.append(None)

    ref_papers = sorted(zip(ref_inst, ref_title, ref_venue, ref_year, ref_topics)[1:]) # sort by ref_inst

    # Extract DBLP selected data
    dblp_data = file("data/selectfiles.dat", "rb").read()
    dblp_data = unpackb(dblp_data)
    dblp_titles = [title for (authors, title, booktitle, year) in dblp_data]
    all_l = len(dblp_data)

    # inst titles
    inst_titles = parse_institutions("data/Institution.csv")

    # Extract ref author data
    author_data = sorted(parse_authors("data/Staff.csv"))
    author_map = {}
    for k, g in groupby(author_data, FIRST):
        author_map[k] = list(g)


    P = ProjectedStrings(dblp_titles, l = (3,5), n_comp=30)
    P.threshold = 0.3

    matched_papers = {}
    matched_people = {}

    author_list = []
    paper_list = []

    for k, g in groupby(ref_papers, FIRST):
        print
        print k, inst_titles[k]
        matched_papers[k] = [0, 0]
        inst_authors = Counter()
        for inst, titl, venu, yr, topic in g:
            matched_papers[k][0] += 1
            #if len(list(P.matches(titl))) == 0:
            #    unmatched_papers[k][1] += 1
            matches_flag = list(P.matches(titl))
            if len(matches_flag) > 0:
                matched_papers[k][1] += 1
                paper_list.append((k, titl, dblp_data[matches_flag[0][0]]))
            #print ">", titl
            for i, mx, title in matches_flag:
                #matched_papers[k][1] += 1
                #print "(%2.2f) %s" % (mx, title)
                #print ", ".join(dblp_data[i][0])
                inst_authors.update(dblp_data[i][0])
            #print

        diverse_names = sum([diversify_name(a) for a in inst_authors], [])
        just_names = [n1 for n1, _ in diverse_names]

        Pauths = ProjectedStrings(just_names, l=(1,3),  n_comp=30)
        Pauths.threshold = 0.45

        matched_people[k] = [0, 0]
        for _, surname, initials in author_map[k]:
            matched_people[k][0] += 1
            # Normalize a bit
            surname1 = surname.lower().translate(None, ' .-,').strip()
            initials1 = initials.lower().translate(None, ' .-,').strip()
            new_name = initials1 + " " + surname1
            matches = sorted(list(Pauths.matches(new_name, k=5)), key=lambda x:x[1], reverse=True)

            # Find a way to break high-ties
            strong_matches = [(idx, mx * inst_authors[diverse_names[idx][1]], diverse_names[idx][1]) for (idx, mx, name) in matches if mx > 0.40]
            strong_matches = [(idx, mx, name) for (idx, mx, name) in strong_matches if mx > 0.55]
            strong_matches = sorted(strong_matches, key=lambda x:x[1], reverse=True)

            if len(strong_matches) > 0:
                matched_people[k][1] += 1
                author_list.append((k, "%s %s" % (initials, surname), strong_matches[0][2]))
                print "%2.2f | %s %s | %s" % (strong_matches[0][1], initials, surname, strong_matches[0][2])
            else:
                print "%s | %s %s | %s" % ("***", initials, surname, "")
                print matches

    print "Packing data"
    packed_authors = packb(author_list, use_bin_type=True)
    file("data/author_list.dat", "wb").write(packed_authors)

    packed_papers = packb(paper_list, use_bin_type=True)
    file("data/paper_list.dat", "wb").write(packed_papers)

    fo = open("results/match_quality.txt", "w")
    print  >>fo, "People and Papers matched"
    for k, (v1, v2) in matched_people.iteritems():
        (m1, m2) = matched_papers[k]
        print >> fo, "%2.2f%% | %2.2f%% | %s" % ( 100* float(v2) / float(v1), 100* float(m2) / float(m1), inst_titles[k])
예제 #2
0
from extractrefdata import ProjectedStrings, parse
from msgpack import unpackb, packb
import random
from collections import defaultdict

# Get the REF data
datas = parse()
dates = parse(field=17)
p = ProjectedStrings(datas, l=(3, 5))
p.threshold = 0.30

dblp_data = file("data/allfiles.dat", "rb").read()
dblp_data = unpackb(dblp_data)
all_l = len(dblp_data)

venue_stats = defaultdict(int)

# Time
old = 0
new_dblp_list = []
print "starting ..."
for l, (authors, title, booktitle, year) in enumerate(dblp_data):

    # Gather some generic statistics about the venue_stats
    if 2009 <= int(year) <= 2014:
        venue_stats[booktitle] += 1

    m = list(p.matches(title, k=10))
    if m == []:
        continue