Example #1

import argparse
import os

# institution_parser, load_assistant_profs, GS_FILE, DBLP_FILE, etc. are
# project-local and not shown in this snippet


def interface():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--inst-file", help="Institution profiles")
    parser.add_argument("-f", "--faculty-file", help="Faculty profiles")
    parser.add_argument("-g", "--gs-dir", help="Directory of GS profiles")
    parser.add_argument("-d", "--dblp-dir", help="Directory of DBLP profiles")
    return parser.parse_args()
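
# Example invocation (the script and file names here are hypothetical):
#
#   python fetch_profiles.py -i institutions.txt -f faculty.txt \
#       -g gs_profiles/ -d dblp_profiles/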


if __name__ == "__main__":
    args = interface()
    inst = institution_parser.parse_institution_records(open(args.inst_file, "rU"))
    faculty = load_assistant_profs(open(args.faculty_file, "rU"), inst)
    # gs_prefix = os.path.join(args.gs_dir, 'GSP_')
    dblp_prefix = os.path.join(args.dblp_dir, "DBLP_")

    for f in faculty:
        # Check for each profile, download if missing
        """
        if 'gs' in f:
            gs_file = os.path.join(args.gs_dir, GS_FILE % f['gs'])
            if not os.path.isfile(gs_file):
                print 'GS -> ', f['facultyName']
                download_all_gs_pages(f['gs'], gs_prefix)
        """

        if "dblp" in f:
            dblp_file = os.path.join(args.dblp_dir, DBLP_FILE % f["dblp"])
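            # The snippet is cut off here; by symmetry with the
            # commented-out GS branch above, the missing lines presumably
            # check whether the file exists and download it if not
            # (download_all_dblp_pages is an assumed name, mirroring
            # download_all_gs_pages):
            #
            #     if not os.path.isfile(dblp_file):
            #         print 'DBLP -> ', f['facultyName']
            #         download_all_dblp_pages(f['dblp'], dblp_prefix)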
Example #2
            name = line.split(':', 1)[-1].strip()
            if name == next_name:
                output.write('# dblp_n      : %d\n' % num_papers[next_ind])
                output.write('# dblp_n_2011 : %d\n' %
                             num_papers_2011[next_ind])
                next_ind += 1
                if next_ind < max_ind:
                    next_name = names[next_ind]
                else:
                    done = True

    if not done:
        print 'WARNING: failed to link all z-scores!'

    output.close()


if __name__ == "__main__":
    args = interface()

    inst = institution_parser.parse_institution_records(open(args.inst_file))
    faculty = load_assistant_profs(open(args.faculty_file, 'rU'), inst)
    load.load_all_publications(faculty, args.dblp_dir, gs_dir=None)
    dists, tots = get_paper_counts_by_topic(faculty)
    means, stds = get_topic_means_stds(dists, tots)
    print means
    print stds
    set_zscores(faculty, means, stds)
    #add_zscores_to_file(faculty, args.faculty_file, args.output_file)
    add_counts_to_file(faculty, args.faculty_file, args.output_file)
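
# Note: get_topic_means_stds and set_zscores above are project-local
# helpers. A minimal sketch of the standardization they appear to
# perform (the 'topic_counts' / 'topic_z' field names are assumptions):

import numpy as np

def set_zscores_sketch(faculty, means, stds):
    # z-score each person's per-topic paper counts: z = (x - mean) / std
    means = np.asarray(means, dtype=float)
    stds = np.asarray(stds, dtype=float)
    for f in faculty:
        counts = np.asarray(f['topic_counts'], dtype=float)
        f['topic_z'] = (counts - means) / stds
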
Example #3
    if len(w) < 3:
        return None
    return w
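
# Only the tail of word_filter survives in these snippets (here and in
# Examples #5, #6 and #8). A plausible reconstruction, assuming it
# lowercases, strips punctuation, drops stop words, and lemmatizes (the
# original body is not shown):
#
#     import string
#
#     def word_filter(word, stop_words, lem):
#         w = word.lower().strip(string.punctuation)
#         if not w or w in stop_words:
#             return None
#         w = lem.lemmatize(w)
#         if len(w) < 3:
#             return None
#         return w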


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()

    faculty = load.load_assistant_profs(open(args.input_file))
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        # word_filter returns None for rejected tokens; keep only real words
        custom_words = [
            word_filter(w.strip(), [], lem)
            for w in open(args.custom_stops, 'rU')
        ]
        stop_words += [w for w in custom_words if w]

    for f in faculty:
        tag = None

        if 'dblp_pubs' in f:
            pass  # (snippet truncated here)
Example #4
    # Finish writing the last record(s)
    if line:  # Didn't reach the end yet, so grab the next line
        line = in_fp.readline()
        while line:
            output.write(line)
            line = in_fp.readline()
                
    print ('Linked %d DBLP profiles and %d GS profiles in new output file'
           % (dblp_linked, gs_linked))
    output.close()


if __name__ == "__main__":
    args = interface()
    
    faculty = load_assistant_profs(open(args.faculty_file, 'rU'))
    link_gs_profiles(faculty, args.gs_file)
    link_dblp_profiles(faculty, args.dblp_file)
    add_links_to_file(faculty, args.faculty_file, args.output_file)
    
    covered = 0
    both = 0
    for f in faculty:
        if 'gs' in f or 'dblp' in f:
            covered += 1
        else:
            print f['facultyName']
        if 'gs' in f and 'dblp' in f:
            both += 1
    print '%d of %d have at least one of the two profiles.' % (covered, len(faculty))
    print '%d of %d have both profiles.' % (both, len(faculty))
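
# link_gs_profiles and link_dblp_profiles are project-local; a minimal
# sketch of the name-based linking they presumably perform (the exact
# matching rule and the profile record fields are assumptions):

def link_profiles_sketch(faculty, profiles, key):
    # Index candidate profiles by author name, then attach the id of an
    # exact-name match to each faculty record under `key` ('gs' or 'dblp')
    by_name = dict((p['name'], p['id']) for p in profiles)
    for f in faculty:
        pid = by_name.get(f['facultyName'])
        if pid is not None:
            f[key] = pid
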
Example #5
    if len(w) < 3:
        return None
    return w


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()
    inst = institution_parser.parse_institution_records(open(args.inst_file))
    faculty = load.load_assistant_profs(open(args.fac_file), inst)
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        # word_filter returns None for rejected tokens; keep only real words
        custom_words = [
            word_filter(w.strip(), [], lem)
            for w in open(args.custom_stops, 'rU')
        ]
        stop_words += [w for w in custom_words if w]

    written = 0
    for f in faculty:
        tag = None
        words = []
    
        if 'dblp_pubs' in f:
            tag = f['dblp']
Example #6
    if len(w) < 3:
        return None
    return w


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()
    inst = institution_parser.parse_institution_records(open(args.inst_file))
    faculty = load.load_assistant_profs(open(args.fac_file), inst)
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        # word_filter returns None for rejected tokens; keep only real words
        custom_words = [
            word_filter(w.strip(), [], lem)
            for w in open(args.custom_stops, 'rU')
        ]
        stop_words += [w for w in custom_words if w]

    written = 0
    for f in faculty:
        tag = None
        words = []
Example #7
HS_INST_FILE = os.path.join(DATA_DIR, 'inst_hs_CURRENT.txt')

# Colors
ACCENT_COLOR_1 = np.array([176., 116., 232.]) / 255.

# Load the standard set of files
# Business
bs_inst = institution_parser.parse_institution_records(open(
    BS_INST_FILE, 'rU'))
all_bs_faculty = list(faculty_parser.parse_faculty_records(
    open(BS_FACULTY_FILE, 'rU'), school_info=bs_inst, ranking='pi'))
bs_faculty = load.load_assistant_profs(open(BS_FACULTY_FILE, 'rU'),
                                       school_info=bs_inst,
                                       ranking='pi',
                                       year_start=1970,
                                       year_stop=2012)
# bs_faculty_df = convert_faculty_list_to_df(bs_faculty)

# Computer Science
cs_inst = institution_parser.parse_institution_records(open(
    CS_INST_FILE, 'rU'))
all_cs_faculty = list(faculty_parser.parse_faculty_records(
    open(CS_FACULTY_FILE, 'rU'), school_info=cs_inst, ranking='pi'))
cs_faculty = load.load_assistant_profs(open(CS_FACULTY_FILE, 'rU'),
                                       school_info=cs_inst,
                                       ranking='pi',
                                       year_start=1970,
                                       year_stop=2012)
Example #8
    if len(w) < 3:
        return None
    return w


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()
    
    faculty = load.load_assistant_profs(open(args.input_file))
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        # word_filter returns None for rejected tokens; keep only real words
        custom_words = [
            word_filter(w.strip(), [], lem)
            for w in open(args.custom_stops, 'rU')
        ]
        stop_words += [w for w in custom_words if w]

    for f in faculty:
        tag = None
    
        if 'dblp_pubs' in f:
            tag = f['dblp']
            try:
                title = np.random.choice(f['dblp_pubs'])['title']
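                # The snippet ends inside this try; it presumably guards
                # against people with no publications, e.g. catching the
                # ValueError that np.random.choice raises for an empty list:
                #
                #     except ValueError:
                #         continue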