示例#1
0
# Module-level gender resolver built from `options` (defined outside this
# excerpt). The main block below calls its determineFirstName() once per
# author to decide whether the author has a usable first name.
gender = leGenderary(options)

if __name__ == "__main__":
    # Entry point: copy every paper from --in to --out, keeping only the
    # authors for whom `gender.determineFirstName` returns a truthy value,
    # and log how many authors were dropped in total.

    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn = args["--in"]
    out_fn = args["--out"]
    debug = args["--debug"]
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    # Running total of authors removed across all papers.
    cnt = 0

    with open(out_fn, 'w') as fout:
        for paper in tqdm(lazy_paper_reader(inp_fn)):
            authors = paper["authors"]
            # Each author is split on whitespace before being handed to the
            # resolver (presumably authors are plain name strings here).
            non_empty_auth = [
                author for author in authors
                if gender.determineFirstName(author.split())
            ]
            cnt += len(authors) - len(non_empty_auth)
            paper["authors"] = non_empty_auth
            # One JSON document per output line.
            fout.write("{}\n".format(json.dumps(paper)))

    logging.info("Removed {} nameless authors".format(cnt))
    logging.info("DONE")
示例#2
0
    """
    Normalize a paper source.
    """
    return source.lower().rstrip().lstrip()


if __name__ == "__main__":
    # Entry point: stream the papers from --in and write those whose
    # "year" is 1970 or later to --out, one JSON document per line.

    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn, out_fn = args["--in"], args["--out"]
    debug = args["--debug"]
    # --n is optional; it is only forwarded to tqdm as the expected total.
    num_of_papers = None if args["--n"] is None else int(args["--n"])

    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)

    # Number of papers that passed the year filter.
    cnt = 0
    with open(out_fn, 'w') as fout:
        progress = tqdm(lazy_paper_reader(inp_fn), total=num_of_papers)
        for paper in progress:
            recent_enough = paper["year"] >= 1970
            if recent_enough:
                fout.write("{}\n".format(json.dumps(paper)))
                cnt += 1

    logging.info("Wrote {} papers to {}".format(cnt, out_fn))
    logging.info("DONE")
    # Script body: count author first names per gender and write one CSV per
    # gender into the --out directory.
    # NOTE(review): this body has no visible `if __name__` header of its own
    # in this excerpt and may continue past the last line shown.
    args = docopt(__doc__)
    inp_fn = args["--in"]
    out_fn = args["--out"]
    # --total is optional; it is only forwarded to tqdm as the expected total.
    total = int(args["--total"]) if args["--total"] is not None \
            else None

    debug = args["--debug"]
    if debug:
        logging.basicConfig(level = logging.DEBUG)
    else:
        logging.basicConfig(level = logging.INFO)

    # gender -> first name -> number of occurrences
    gender_dict = defaultdict(lambda: defaultdict(lambda: 0))
    # gender -> year -> first name -> number of occurrences
    # NOTE(review): filled below but not read within the visible lines;
    # presumably consumed further down in the original file.
    gender_by_year = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0)))

    for paper in tqdm(lazy_paper_reader(inp_fn), total = total):
        for author in paper["authors"]:
            first_name = author["first_name"].strip()
            # Authors whose first name is empty after stripping are not counted.
            if first_name:
                gender_dict[author["gender"]][first_name] += 1
                gender_by_year[author["gender"]][paper["year"]][first_name] += 1

    # out_fn is used as a directory: one "<gender>.csv" file per gender key.
    for gender in gender_dict:
        cur_fn = os.path.join(out_fn, "{}.csv".format(gender))
        logging.info("Writing to {}".format(cur_fn))
        with open(cur_fn, 'w') as fout:
            # Rows are "name,count", most frequent first; because the rows are
            # joined with '\n', the file has no trailing newline.
            fout.write('\n'.join(["{},{}".format(name, count)
                                  for (name, count)
                                  in sorted(gender_dict[gender].items(),
                                            key = itemgetter(1),
                                            reverse = True)]))
示例#4
0
def normalize_source(source):
    """
    Return `source` lower-cased, with leading and trailing whitespace removed.
    """
    lowered = source.lower()
    return lowered.strip()


if __name__ == "__main__":
    # Entry point: scan every "s2-corpus-*" file under the --in directory
    # and write the papers that list "medline" (after normalization) among
    # their sources to --out, one JSON document per line.

    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn, out_fn = args["--in"], args["--out"]
    debug = args["--debug"]
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)

    # Number of papers written so far.
    cnt = 0
    with open(out_fn, 'w') as fout:
        corpus_pattern = os.path.join(inp_fn, "s2-corpus-*")
        for fn in glob(corpus_pattern):
            logging.info("Filtering {}...".format(fn))
            for paper in tqdm(lazy_paper_reader(fn)):
                normalized_sources = map(normalize_source, paper["sources"])
                if "medline" in normalized_sources:
                    fout.write("{}\n".format(json.dumps(paper)))
                    cnt += 1

    logging.info("Wrote {} papers to {}".format(cnt, out_fn))
    logging.info("DONE")
示例#5
0
    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn = args["--in"]
    out_fn = args["--out"]
    debug = args["--debug"]
    filter_flag = args["--filter"]
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    logging.info("Reading papers from {}".format(inp_fn))
    if filter_flag:
        # Keep only papers from 2001 or 2002 that have at least one author
        # whose gender is not "unknown".
        # NOTE(review): `total` is a hard-coded count used only for the
        # progress bar -- presumably the size of one specific corpus.
        papers = []
        for paper in tqdm(lazy_paper_reader(inp_fn), total=12491238):
            has_known_gender = any(author["gender"] != "unknown"
                                   for author in paper["authors"])
            if has_known_gender and paper["year"] in (2001, 2002):
                papers.append(paper)
    else:
        papers = list(lazy_paper_reader(inp_fn))

    # Group the selected papers by venue for BOTH branches: the code that
    # follows reads `papers_by_venues` unconditionally, so building it only
    # in the unfiltered branch raised NameError whenever --filter was set.
    papers_by_venues = defaultdict(list)
    for paper in papers:
        papers_by_venues[paper["venue"]].append(paper)

    logging.info("Checking male author count...")
    sorted_male_venues = sorted(
        [(venue, is_problematic_venue(venue_papers))
         for venue, venue_papers in tqdm(papers_by_venues.iteritems(),
                                         total=len(papers_by_venues))],
示例#6
0
from tqdm import tqdm

# Local imports
from sqlite_manager import Sqlite_Database
from add_gender import lazy_paper_reader
#=-----

if __name__ == "__main__":
    # Entry point: cross-check the sqlite database (--db) against the JSON
    # papers file (--json). For every paper read from the JSON file, the
    # author genders and names returned by `db.get_paper_genders` must equal
    # the ones in the JSON record, in the same order.
    #
    # Raises:
    #     AssertionError: on the first paper whose database genders or
    #         author names differ from the JSON record.

    # Parse command line arguments
    args = docopt(__doc__)
    db_fn = args["--db"]
    json_fn = args["--json"]
    debug = args["--debug"]
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    with Sqlite_Database(db_fn) as db:
        for paper in tqdm(lazy_paper_reader(json_fn)):
            json_genders = [author['gender'] for author in paper['authors']]
            json_authors = [author['name'] for author in paper['authors']]

            db_genders, db_authors = db.get_paper_genders(paper['id'])

            # Raise explicitly rather than via `assert`: assert statements
            # are removed under `python -O`, which would turn this whole
            # verification into a no-op. The exception type is unchanged.
            if not (db_genders == json_genders):
                raise AssertionError(
                    "Gender mismatch for paper {}: db={!r}, json={!r}".format(
                        paper['id'], db_genders, json_genders))
            if not (db_authors == json_authors):
                raise AssertionError(
                    "Author mismatch for paper {}: db={!r}, json={!r}".format(
                        paper['id'], db_authors, json_authors))

        logging.info("DONE")