# Name resolver shared by the filtering loop below; `options` is defined
# earlier in the file (not visible in this fragment).
gender = leGenderary(options)


if __name__ == "__main__":
    # Script entry point: copy the input papers to the output file, dropping
    # every author for which no first name can be determined.

    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn = args["--in"]
    out_fn = args["--out"]
    debug = args["--debug"]
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    # Number of authors removed across all papers.
    cnt = 0
    with open(out_fn, 'w') as fout:
        for paper in tqdm(lazy_paper_reader(inp_fn)):
            # Authors are plain strings here; keep those for which
            # determineFirstName returns a truthy value for the split name.
            non_empty_auth = [
                author for author in paper["authors"]
                if gender.determineFirstName(author.split())
            ]
            cnt += len(paper["authors"]) - len(non_empty_auth)
            paper["authors"] = non_empty_auth
            # One JSON object per output line.
            fout.write("{}\n".format(json.dumps(paper)))

    logging.info("Removed {} nameless authors".format(cnt))
    logging.info("DONE")
# NOTE(review): this line starts with the tail (docstring + return) of a function whose `def` header is not visible here; the return value is the source string lowercased with leading/trailing whitespace removed.
# The script entry point that follows reads --in, --out, --debug and the optional --n (used only as the tqdm progress total), writes every paper whose "year" is >= 1970 to the output file as one JSON object per line, and logs how many papers were written.
# NOTE(review): assumes every paper has a comparable "year" value; a None year would fail the >= comparison under Python 3 -- confirm against the input data.
""" Normalize a paper source. """ return source.lower().rstrip().lstrip() if __name__ == "__main__": # Parse command line arguments args = docopt(__doc__) inp_fn = args["--in"] out_fn = args["--out"] debug = args["--debug"] num_of_papers = int(args["--n"]) if args["--n"] is not None \ else None if debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) cnt = 0 with open(out_fn, 'w') as fout: for paper in tqdm(lazy_paper_reader(inp_fn), total=num_of_papers): if paper["year"] >= 1970: cnt += 1 fout.write("{}\n".format(json.dumps(paper))) logging.info("Wrote {} papers to {}".format(cnt, out_fn)) logging.info("DONE")
# Parse command line arguments
args = docopt(__doc__)
inp_fn = args["--in"]
out_fn = args["--out"]
raw_total = args["--total"]
# --total is optional and only feeds the progress bar.
total = None if raw_total is None else int(raw_total)
debug = args["--debug"]
logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

# gender -> first name -> number of occurrences
gender_dict = defaultdict(lambda: defaultdict(int))
# gender -> year -> first name -> number of occurrences
# NOTE(review): filled below but not read in the visible part of the script.
gender_by_year = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for paper in tqdm(lazy_paper_reader(inp_fn), total=total):
    for author in paper["authors"]:
        first_name = author["first_name"].strip()
        if not first_name:
            # Authors without a usable first name are not counted.
            continue
        author_gender = author["gender"]
        gender_dict[author_gender][first_name] += 1
        gender_by_year[author_gender][paper["year"]][first_name] += 1

# Write one "<gender>.csv" file per gender into the --out directory, with
# "name,count" rows ordered from most to least frequent.
for gender in gender_dict:
    cur_fn = os.path.join(out_fn, "{}.csv".format(gender))
    logging.info("Writing to {}".format(cur_fn))
    with open(cur_fn, 'w') as fout:
        ranked = sorted(gender_dict[gender].items(),
                        key=itemgetter(1),
                        reverse=True)
        rows = ["{},{}".format(name, count) for name, count in ranked]
        fout.write('\n'.join(rows))
def normalize_source(source):
    """
    Return the paper source lowercased, without surrounding whitespace.
    """
    lowered = source.lower()
    return lowered.strip()


if __name__ == "__main__":
    # Script entry point: scan every "s2-corpus-*" file in the input
    # directory and keep the papers that list "medline" among their sources.

    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn, out_fn, debug = args["--in"], args["--out"], args["--debug"]
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    # Number of papers written so far.
    cnt = 0
    with open(out_fn, 'w') as fout:
        corpus_pattern = os.path.join(inp_fn, "s2-corpus-*")
        for fn in glob(corpus_pattern):
            logging.info("Filtering {}...".format(fn))
            for paper in tqdm(lazy_paper_reader(fn)):
                from_medline = any(normalize_source(src) == "medline"
                                   for src in paper["sources"])
                if not from_medline:
                    continue
                cnt += 1
                # One JSON object per output line.
                fout.write("{}\n".format(json.dumps(paper)))

    logging.info("Wrote {} papers to {}".format(cnt, out_fn))
    logging.info("DONE")
# NOTE(review): fragment -- the final `sorted(` expression is not closed on this line, so the sort key/order is not visible here.
# Reads --in, --out, --debug and --filter. With --filter set, only papers from 2001 or 2002 that have at least one author whose gender is not "unknown" are kept; otherwise every paper from the input is loaded into memory. The papers are then grouped by their "venue" value and each venue's papers are passed to is_problematic_venue (defined elsewhere).
# NOTE(review): total=12491238 is a hard-coded progress-bar total, and dict.iteritems() exists only on Python 2 -- confirm the intended interpreter.
# Parse command line arguments args = docopt(__doc__) inp_fn = args["--in"] out_fn = args["--out"] debug = args["--debug"] filter_flag = args["--filter"] if debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.info("Reading papers from {}".format(inp_fn)) if filter_flag: papers = [] for paper in tqdm(lazy_paper_reader(inp_fn), total=12491238): if (any([author["gender"] != "unknown" for author in paper["authors"]])) and \ ((paper["year"] == 2001) or (paper["year"] == 2002)): papers.append(paper) else: papers = list(lazy_paper_reader(inp_fn)) papers_by_venues = defaultdict(list) for paper in papers: papers_by_venues[paper["venue"]].append(paper) logging.info("Checking male author count...") sorted_male_venues = sorted( [(venue, is_problematic_venue(venue_papers)) for venue, venue_papers in tqdm(papers_by_venues.iteritems(), total=len(papers_by_venues))],
from tqdm import tqdm

# Local imports
from sqlite_manager import Sqlite_Database
from add_gender import lazy_paper_reader
#=-----

if __name__ == "__main__":
    # Script entry point: verify that the author names and genders stored in
    # the SQLite database match the ones in the JSON paper file.

    # Parse command line arguments
    args = docopt(__doc__)
    db_fn = args["--db"]
    json_fn = args["--json"]
    debug = args["--debug"]
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    with Sqlite_Database(db_fn) as db:
        for paper in tqdm(lazy_paper_reader(json_fn)):
            json_genders = [author['gender'] for author in paper['authors']]
            json_authors = [author['name'] for author in paper['authors']]
            db_genders, db_authors = db.get_paper_genders(paper['id'])

            # Raise explicitly instead of using `assert`: assert statements
            # are removed under `python -O`, which would silently turn this
            # whole check into a no-op. The exception type is unchanged.
            if not (db_genders == json_genders):
                raise AssertionError(
                    "Gender mismatch for paper {}: db={!r}, json={!r}".format(
                        paper['id'], db_genders, json_genders))
            if not (db_authors == json_authors):
                raise AssertionError(
                    "Author mismatch for paper {}: db={!r}, json={!r}".format(
                        paper['id'], db_authors, json_authors))

    logging.info("DONE")