Example No. 1
def analyze(ss, cfg):
    """
    Run job 
    :param ss: SparkSession
    :param cfg: app configuration
    :return: None
    """

    logger = logging.getLogger(__name__)
    logger.info('Python version: {}'.format(sys.version))
    logger.info(
        'Extracting dataset of MAG institution ids mapped to their latitude and longitude values.'
    )

    # MAG dataset to use
    db_name = cfg['mag_db_name']

    # tables that will be used =============================================== #
    aff_df = ss.table(
        db_name +
        '.affiliations')  # this will be used to identify institutions
    aff_lat_long_df = aff_df.select('affiliationid', 'latitude', 'longitude')

    # Let's write the dataset to file
    LogUtils().describe_df(aff_lat_long_df, 'Affiliation lat long dataset')
    output_filename = join(cfg['hdfs']['onmerrit_dir'],
                           'affiliation_id_lat_long.csv')
    aff_lat_long_df.write.csv(output_filename,
                              mode="overwrite",
                              header=True,
                              sep=",",
                              quoteAll=True)
    logger.info("\n\n\nWrote the affiliation_id_lat_long dataset to file " +
                output_filename)
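
A short, hedged driver sketch for the job above: the function only needs a ready-made SparkSession and a config dictionary, so the snippet below shows one plausible way to invoke it. The config keys ('mag_db_name', 'hdfs' -> 'onmerrit_dir') match what the function reads, but the app name, database name, and paths are hypothetical placeholders, and running it also requires the project's own helpers (e.g. LogUtils) to be importable.

# Hypothetical driver sketch -- not part of the original project code.
import logging
from pyspark.sql import SparkSession

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    spark = (SparkSession.builder
             .appName('affiliation_lat_long')   # placeholder app name
             .enableHiveSupport()               # the job reads Hive tables via ss.table()
             .getOrCreate())

    cfg = {
        'mag_db_name': 'mag',                           # Hive database holding the MAG tables
        'hdfs': {'onmerrit_dir': '/project/onmerrit'},  # placeholder HDFS output directory
    }

    analyze(spark, cfg)
    spark.stop()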
Example No. 2
def analyze(sc, cfg):
    """
    Run job 
    :param sc: SparkContext
    :param cfg: app configuration
    :return: None
    """
    logger = logging.getLogger(__name__)
    logger.info('Python version: {}'.format(sys.version))
    logger.info('Counting words...')

    # needs to be initialized (even if not used) to be able
    # to work with DataFrames
    sql_sc = SQLContext(sc)

    core_dir = cfg['hdfs']['core_dir']

    text_01 = (
        f'CORE’s mission is to aggregate all open access research outputs from '
        f'repositories and journals worldwide and make them available to the '
        f'public. In this way CORE facilitates free unrestricted access to '
        f'research for all.')
    text_02 = (
        f'CORE harvests research papers from data providers from all over the '
        f'world including institutional and subject repositories, open access '
        f'and hybrid journal publishers.')
    text_03 = (
        f'CORE currently contains 135,539,113 open access articles, from '
        f'thousands and over tens of thousands of journals, collected from '
        f'5,969 data providers around the world.')
    text_04 = (
        f'CORE will supply data for the UK REF 2021 Open Access Policy Audit '
        f'to Research England. We provide advice and support to UK HEIs with '
        f'exposing their outputs data to CORE.')

    texts = [text_01, text_02, text_03, text_04]
    words = sc.parallelize(texts).flatMap(lambda text: text.split())
    words = words.map(lambda word: (word, 1))
    counts = words.reduceByKey(lambda a, b: a + b)
    ordered = counts.sortBy(lambda pair: pair[1], ascending=False)
    ordered = ordered.toDF(['word', 'count'])
    LogUtils().describe_df(ordered, 'word_count')
    output_path = join(core_dir, 'test_job_output')
    logger.info(f'Storing results in {output_path}')
    ordered.coalesce(1).write.csv(output_path, mode='overwrite')
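
Unlike the other examples, this job is RDD-based and takes a SparkContext rather than a SparkSession. A minimal, hypothetical driver under that assumption is shown below; only hdfs.core_dir is read by the job, and the app name and path are placeholders.

# Hypothetical driver sketch for the RDD-based word-count job above.
import logging
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    sc = SparkContext(conf=SparkConf().setAppName('core_word_count'))  # placeholder app name

    cfg = {'hdfs': {'core_dir': '/project/core'}}  # placeholder output directory

    analyze(sc, cfg)
    sc.stop()

Note that the SQLContext created inside analyze is what makes ordered.toDF(['word', 'count']) work on the plain RDD, even though the SQLContext object itself is never referenced again.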
Example No. 3
def analyze(ss, cfg):
    """
    Run job
    :param ss: SparkSession
    :param cfg: app configuration
    :return: None
    """

    logger = logging.getLogger(__name__)
    logger.info('Python version: {}'.format(sys.version))
    logger.info(
        'Extracting dataset of all papers published by all institutions in the countries of our choice, divided into disciplines.'
    )

    # MAG dataset to use
    db_name = cfg['mag_db_name']

    # tables that will be used =============================================== #
    paper_fieldsofStudy = ss.table(
        db_name + '.paperfieldsofstudy'
    )  # this will give the domain id of the paper -- domain id for agriculture, climate, health or something else
    '''aff_df = ss.table(db_name + '.affiliations')  # this will be used to identify institutions
    paper_author_aff_df = ss.table(db_name + '.paperauthoraffiliations')  # this will be used to identify papers coming from an institution
    paper_df = ss.table(db_name + '.papers')  # this will give the details of a paper.
    fieldsofstudy_df = ss.table(db_name+'.fieldsofstudy') # this contains the mapping of domain id to domain name.
    fieldofstudychildren_df = ss.table(db_name + '.fieldofstudychildren')  # this contains the subdomain for given domain.'''

    fos_hierarchy_df = ss.read.csv(join(cfg['hdfs']['onmerrit_dir'],
                                        'fos_hierarchy.csv'),
                                   header=True,
                                   mode="DROPMALFORMED")
    disciplines_ids = {}
    for discipline in cfg['data']['fields_of_study']:
        discipline_df = fos_hierarchy_df.filter(
            (fos_hierarchy_df.normalizedname == discipline)).select(
                'fieldofstudyid', 'child_ids')

        child_fields_ids = [
            row.child_ids.split(",")
            for row in discipline_df.select('child_ids').collect()
        ]  # This gives a list of lists of strings
        child_fields_ids = list(
            chain.from_iterable(child_fields_ids)
        )  # This flattens the 2d list into 1D. https://stackoverflow.com/a/29244327/530399
        child_fields_ids = [int(x) for x in child_fields_ids]

        field_ids = [
            row.fieldofstudyid.split(",")
            for row in discipline_df.select('fieldofstudyid').collect()
        ]  # This gives a list of lists of strings
        field_ids = list(
            chain.from_iterable(field_ids)
        )  # This flattens the 2d list into 1D. https://stackoverflow.com/a/29244327/530399
        field_ids = [int(x) for x in field_ids]

        discipline_field_ids = child_fields_ids + field_ids
        disciplines_ids[discipline] = discipline_field_ids

    for country_name, _ in cfg['data']['institutions_by_country'].items():

        country_papers_df = ss.read.csv(join(cfg['hdfs']['onmerrit_dir'],
                                             country_name + '_papers.csv'),
                                        header=True,
                                        mode="DROPMALFORMED")

        country_papers_fos_df = country_papers_df.join(
            paper_fieldsofStudy, country_papers_df.paperid ==
            paper_fieldsofStudy.paperid)  # inner join
        country_papers_fos_df = country_papers_fos_df.drop(
            paper_fieldsofStudy.paperid)  # keep only the necessary fields
        LogUtils().describe_df(country_papers_df, country_name + ' Papers df')

        # Lets find the subset of papers that belong to each of our disciplines
        for discipline in cfg['data']['fields_of_study']:
            discipline_ids = disciplines_ids[discipline]

            country_papers_discipline_df = country_papers_fos_df.filter(
                F.col("fieldofstudyid").isin(discipline_ids))
            x_len = country_papers_discipline_df.count()

            # Only keep those papers that are identified to belong to the discipline with a score greater than 0.3 -- others are probably false positives
            country_papers_discipline_df = country_papers_discipline_df.filter(
                F.col("score") > 0.3)
            y_len = country_papers_discipline_df.count()
            logger.info("Removed " + str(x_len - y_len) + " entries for " +
                        country_name + " papers in discipline " + discipline +
                        " with score of less than 0.3")

            # Remove duplicates within this discipline
            country_papers_discipline_df = country_papers_discipline_df.dropDuplicates(
                ['paperid'])
            z_len = country_papers_discipline_df.count()
            logger.info("Removed " + str(y_len - z_len) + " duplicates for " +
                        country_name + " papers in discipline " + discipline)

            # Write the current dataset to file
            output_filename = join(
                cfg['hdfs']['onmerrit_dir'],
                country_name + "_" + discipline + "_papers.csv")
            country_papers_discipline_df.write.csv(output_filename,
                                                   mode="overwrite",
                                                   header=True,
                                                   sep=",",
                                                   quoteAll=True)
            logger.info("\n\nWrote dataset for " + discipline +
                        " discipline for country " + country_name +
                        " to file " + output_filename + "\n\n")
Example No. 4
def analyze(ss, cfg):
    """
    Run job
    :param ss: SparkSession
    :param cfg: app configuration
    :return: None
    """

    logger = logging.getLogger(__name__)
    logger.info('Python version: {}'.format(sys.version))
    logger.info(
        'Extracting dataset of OA status of all papers published by all THE WUR universities in the countries of our choice.'
    )

    # MAG dataset to use
    db_name = cfg['mag_db_name']

    # datasets that will be used =============================================== #
    mucc_df = ss.read.parquet(cfg['hdfs']['mucc_dir']).select(
        ['paperid', 'link'])  # this will be used to determine OA status.
    # The mucc dataset doesn't have distinct entries for the ['paperid', 'link'] subset; i.e. the same paperid could have different/multiple links, or no link in one row and some link in another. I will infer that a paper is OA if there is at least one link.
    # just retain what are the distinct combinations of subset ['paperid','link']
    mucc_df = mucc_df.distinct()
    # Since the same paper id may have multiple OA links (or be marked with a null value), let's create a single entry with the list of all such links in one row.
    mucc_df = mucc_df.groupby("paperid").agg(
        F.collect_list("link").alias("list_links"))
    # Add a column for OA flag
    oa_flag = F.when(F.size("list_links") > 0, True).otherwise(
        False
    )  # considered to be OA if at least one link is found. F.size gives the length of the list.
    mucc_df = mucc_df.withColumn("is_OA", oa_flag)

    # tables that will be used
    papers_df = ss.table(db_name + '.papers').select(
        ['paperid', 'year']).drop_duplicates(
        )  # this will be used to extract publication year of the papers.

    total_result_df = None

    for country_name, univ_names in cfg['data'][
            'all_THE_institutions_by_country'].items():

        country_papers_df = ss.read.csv(join(cfg['hdfs']['onmerrit_dir'],
                                             country_name + '_papers.csv'),
                                        header=True,
                                        mode="DROPMALFORMED")
        logger.info("\n\n\nProcessing dataset of papers from " + country_name)

        country_papers_oa_df = country_papers_df.join(
            mucc_df, ['paperid'], how='left_outer'
        )  # left outer join so as to preserve all and only papers of the country in the result. inner join won't be good because (currently) mucc does not have entries for some paperids

        country_papers_oa_df = country_papers_oa_df.join(
            papers_df, ['paperid'], how='inner'
        )  # add in the information about the publication year of each paper.

        # save the data for the current country
        output_filename = join(cfg['hdfs']['onmerrit_dir'],
                               "OA_status_" + country_name + "_papers.csv")
        # csv format doesn't support writing arrays; need to be converted to string representation
        country_papers_oa_df = country_papers_oa_df.withColumn(
            'url_lists_as_string',
            array_to_string_udf(country_papers_oa_df["list_links"]))
        country_papers_oa_df = country_papers_oa_df.drop("list_links")

        country_papers_oa_df.write.csv(output_filename,
                                       mode="overwrite",
                                       header=True,
                                       sep=",",
                                       quoteAll=True)
        logger.info("\n\nWrote dataset for country : " + country_name +
                    " to file " + output_filename + "\n\n")

        # Update the total_result with info from this loop. Need to add the relevant country name.
        country_papers_oa_df = country_papers_oa_df.withColumn(
            'country_name', F.lit(country_name))

        if total_result_df is None:
            total_result_df = country_papers_oa_df
        else:
            # https://datascience.stackexchange.com/a/27231
            total_result_df = total_result_df.union(
                country_papers_oa_df.select(total_result_df.columns))

    # Let's write the total dataset to file
    LogUtils().describe_df(
        total_result_df, 'Countries All Papers with OA status info dataset ')
    total_output_filename = join(cfg['hdfs']['onmerrit_dir'],
                                 'OA_status_all_countries_all_papers.csv')
    total_result_df.write.csv(total_output_filename,
                              mode="overwrite",
                              header=True,
                              sep=",",
                              quoteAll=True)
    logger.info("\n\n\nWrote the OA status dataset for all countries to file " +
                total_output_filename)
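
Spark's CSV writer cannot serialise array columns, which is why the job converts list_links to a string before writing. The helper array_to_string_udf is defined outside this excerpt; the sketch below is one plausible definition (an assumption, not the project's actual code).

# Plausible definition of the array_to_string_udf helper used above (an assumption).
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

array_to_string_udf = F.udf(
    lambda links: ','.join(links) if links else '',  # join the link list; empty string for null/empty lists
    StringType())

A built-in alternative worth considering for the same conversion is F.concat_ws(',', 'list_links').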
Example No. 5
def analyze(ss, cfg):
    """
    Run job
    :param ss: SparkSession
    :param cfg: app configuration
    :return: None
    """

    logger = logging.getLogger(__name__)
    logger.info('Python version: {}'.format(sys.version))
    logger.info(
        'Extracting dataset of all papers published by all institutions in the countries of our choice.'
    )

    # MAG dataset to use
    db_name = cfg['mag_db_name']

    # tables that will be used =============================================== #
    aff_df = ss.table(
        db_name +
        '.affiliations')  # this will be used to identify institution names
    paper_author_aff_df = ss.table(
        db_name + '.paperauthoraffiliations'
    )  # this will be used to identify papers coming from an institution
    '''paper_df = ss.table(db_name + '.papers')  # this will give the details of a paper.
    paper_fieldsofStudy = ss.table(db_name+'.paperfieldsofstudy')  # this will give the domain id of the paper -- domain id for agriculture, climate, health or something else
    fieldsofstudy_df = ss.table(db_name+'.fieldsofstudy') # this contains the mapping of domain id to domain name.
    fieldofstudychildren_df = ss.table(db_name + '.fieldofstudychildren')  # this contains the subdomain for given domain.'''

    affiliation_countryname_df = ss.read.csv(join(cfg['hdfs']['onmerrit_dir'],
                                                  'affiliations_country.csv'),
                                             header=True,
                                             mode="DROPMALFORMED")
    LogUtils().describe_df(affiliation_countryname_df,
                           'Affiliations country name dataset')

    total_result_df = None

    for country_name, _ in cfg['data']['institutions_by_country'].items():
        cnames = cfg['data']['country_names_variations'][
            country_name]  # possible variations of country names
        cnames.append(country_name)
        logger.info("\n\n\nPossible name variations for " + country_name +
                    " = " + str(cnames))
        all_institutions_ids_df = affiliation_countryname_df.filter(
            F.col("country").isin(cnames)).select("affiliationid").distinct(
            )  # all institutions within the country

        country_papers_df = all_institutions_ids_df.join(
            paper_author_aff_df, all_institutions_ids_df.affiliationid ==
            paper_author_aff_df.affiliationid)  # inner join
        country_papers_df = country_papers_df.select(
            'paperid', paper_author_aff_df.affiliationid
        )  # keep only the necessary fields
        # the same paper could have multiple authors within the same institution and therefore have multiple entries in paper_author_aff_df. Need to get rid of such duplicate entries.
        # This will however preserve records with the same paperid but different affiliationid (when the same paper is written by authors from different universities), which is desired.
        country_papers_df = country_papers_df.dropDuplicates()

        # To get back the name of the institutions
        country_papers_institution_df = country_papers_df.join(
            aff_df, country_papers_df.affiliationid ==
            aff_df.affiliationid)  # inner join
        # keep only the necessary fields.
        '''I keep wikipage because it might contain English names for non-English universities. Example:
        Universidade Federal de Ciências da Saúde de Porto Alegre has wikipage http://en.wikipedia.org/wiki/Federal_University_of_Health_Sciences_of_Porto_Alegre
        '''
        country_papers_institution_df = country_papers_institution_df.select(
            'paperid', country_papers_df.affiliationid, 'normalizedname',
            'displayname', 'wikipage')

        # save the data for the current country
        output_filename = join(cfg['hdfs']['onmerrit_dir'],
                               country_name + "_papers.csv")
        country_papers_institution_df.write.csv(output_filename,
                                                mode="overwrite",
                                                header=True,
                                                sep=",",
                                                quoteAll=True)
        logger.info("\n\nWrote dataset for country : " + country_name +
                    " to file " + output_filename + "\n\n")

        # Update the total_result with info from this loop. Need to add the relevant info.
        country_papers_institution_df = country_papers_institution_df.withColumn(
            'country_name', F.lit(country_name))

        if total_result_df is None:
            total_result_df = country_papers_institution_df
        else:
            # https://datascience.stackexchange.com/a/27231
            total_result_df = total_result_df.union(
                country_papers_institution_df.select(total_result_df.columns))

    # Let's write the total dataset to file
    LogUtils().describe_df(
        total_result_df, 'Countries All Papers with affiliation info dataset ')
    total_output_filename = join(cfg['hdfs']['onmerrit_dir'],
                                 'all_countries_all_papers.csv')
    total_result_df.write.csv(total_output_filename,
                              mode="overwrite",
                              header=True,
                              sep=",",
                              quoteAll=True)
    logger.info("\n\n\nWrote the all-countries papers dataset to file " +
                total_output_filename)
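
All five excerpts assume the same surrounding module context. The header below reconstructs the imports implied by the names they use; it is an inference from the code, and the project-specific helpers LogUtils and array_to_string_udf live in modules whose paths are not shown in the excerpts.

# Imports implied by the excerpts above (reconstructed, not the original module header).
import logging
import sys
from itertools import chain
from os.path import join

from pyspark.sql import SQLContext
from pyspark.sql import functions as F
# LogUtils and array_to_string_udf are project-specific helpers defined elsewhere in the codebase.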