def _papers_citations_number_by_year_sframe(without_self_citation=True):
    """
    Get papers total number of citation in each year
    :param without_self_citation: if True calculate only non-self citations,
        otherwise include self-citations as well
    :return: SFrame with a column that contains citations_dict by year
    """
    logger.info("Creating Paper Citations by Year (without_self_citation=%s)" % without_self_citation)
    refs = tc.load_sframe(EXTENDED_PAPER_REFERENCES_SFRAME)
    if without_self_citation:
        # keep only references flagged as having no shared authors
        refs = refs[refs['self citation'] == 0]
    years = tc.load_sframe(PAPERS_SFRAME)["Paper ID", "Paper publish year"]
    joined = refs.join(years, on="Paper ID")
    # count how many times each paper was cited in each publication year
    per_year = joined.groupby(["Paper reference ID", "Paper publish year"],
                              {"Citation Number": agg.COUNT()})
    per_year = per_year.rename({"Paper publish year": "Year",
                                "Paper reference ID": "Paper ID"})
    per_year['Citation by Year'] = per_year.apply(
        lambda row: (row["Year"], row["Citation Number"]))
    result = per_year.groupby(
        'Paper ID',
        {'Citation by Years': tc.aggregate.CONCAT('Citation by Year')})
    out_col = ('Total Citations by Year without Self Citations'
               if without_self_citation else 'Total Citations by Year')
    result[out_col] = result['Citation by Years'].apply(_get_total_citation_by_year)
    return result.remove_column("Citation by Years")
def get_papers_ids_dict(self, venue_id, venue_name, venue_type=VenueType.journal, issn_list=()):
    """
    Returns the venue's paper ids both that appear in the MAG paper dataset
    and in the AMinerMag join dataset

    :param venue_type: the venue type (defaults to VenueType.journal)
    :param venue_id: the MAG venue id
    :param venue_name: the venue's name
    :param issn_list: ISSNs list
    :return: dict with the venue's papers ids. The dict has two keys
        'papers_ids' & 'join_papers_ids'
    :rtype: dict
    :note: ISSN format is XXXX-XXXX (with '-')
    """
    logger.info(
        f"Getting papers id of venue_id={venue_id},venue_name={venue_name}. and issn_list={issn_list}"
    )
    papers_ids_dict = {
        'papers_ids': self._get_papers_ids(venue_id, venue_name, venue_type)
    }
    # ids collected via the AMiner/MAG join column
    l = self._get_papers_ids(venue_id, venue_name, venue_type, use_join_col=True)
    # extend with ids resolved through each ISSN in the join collection
    for issn in issn_list:
        l += [
            j['MAG Paper ID']
            for j in self._papers_join_collection.find({"issn": issn})
        ]
    # deduplicate ids gathered from the join column and the ISSN lookups
    papers_ids_dict['join_papers_ids'] = list(set(l))
    return papers_ids_dict
def create_references_sframe():
    """Creating the references SFrame from txt files"""
    logger.info("Creating References SFrame")
    # skip the work if the SFrame was already built
    if os.path.isdir(PAPER_REFERENCES_SFRAME):
        return
    references = tc.SFrame.read_csv(PAPER_REFERENCES_TXT, header=False,
                                    delimiter="\t")
    references = references.rename({"X1": "Paper ID",
                                    "X2": "Paper reference ID"})
    references.save(PAPER_REFERENCES_SFRAME)
def create_references_count_sframe():
    """Creating SFrame with the number of references in each paper"""
    logger.info("Creating References Count SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPER_REFERENCES_COUNT_SFRAME):
        return
    references = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    counts = references.groupby("Paper ID", {"Ref Number": agg.COUNT()})
    counts.save(PAPER_REFERENCES_COUNT_SFRAME)
def create_fields_of_study_sframe():
    """ Creating Field of study SFrame from txt files """
    logger.info("Creating Fields of Study SFrame")
    # already built -- nothing to do
    if os.path.isdir(FIELDS_OF_STUDY_SFRAME):
        return
    fields = tc.SFrame.read_csv(FIELDS_OF_STUDY_TXT, header=False,
                                delimiter="\t")
    fields = fields.rename({"X1": "Field of study ID",
                            "X2": "Field of study name"})
    fields.save(FIELDS_OF_STUDY_SFRAME)
def create_aminer_sframe():
    """ Create AMiner Papers sFrame from the AMiner text files. After creating the SFrame, it is save to AMINER_PAPERS_SFRAME """
    logger.info("Creating AMiner Papers SFrame")
    # already built -- nothing to do
    if os.path.isdir(AMINER_PAPERS_SFRAME):
        return
    aminer = tc.SFrame.read_json(AMINER_TXT_FILES, orient='lines')
    aminer.save(AMINER_PAPERS_SFRAME)
def update_journals_features():
    """
    Update the venue stats of every journal whose SFrame is stored under
    /data/sframes/journals; a failure on one journal is logged and does not
    stop the loop.
    """
    for sframe_name in os.listdir("/data/sframes/journals"):
        try:
            logger.info(f"Updating {sframe_name}")
            jid = sframe_name.split(".sframe")[0]
            va = VenueAnalyzer(jid, VenueType.journal, None)
            va.update_venue_stats()
        except Exception as e:
            # Bug fix: BaseException.message was removed in Python 3, so
            # e.message raised AttributeError inside the handler and hid the
            # real error; print/log the exception itself instead.
            print(e)
            logger.exception(e)
def create_paper_keywords_list_sframe():
    """ Creating Paper Keywords List SFrame """
    logger.info("Creating Papers' Keywords List SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPER_KEYWORDS_LIST_SFRAME):
        return
    keywords = tc.load_sframe(PAPER_KEYWORDS_SFRAME)
    # collect each paper's keywords into a single list column
    grouped = keywords.groupby("Paper ID",
                               {"Keywords List": agg.CONCAT("Keyword name")})
    grouped.save(PAPER_KEYWORDS_LIST_SFRAME)
def create_urls_sframe():
    """ Creating URLs SFrame from txt files """
    logger.info("Creating urls SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPER_URLS_SFRAME):
        return
    urls = tc.SFrame.read_csv(PAPER_URLS_TXT, header=False, delimiter="\t")
    urls = urls.rename({"X1": "Paper ID", "X2": "Url"})
    # collect each paper's urls into a single list column
    grouped = urls.groupby("Paper ID", {"Urls": agg.CONCAT("Url")})
    grouped.save(PAPER_URLS_SFRAME)
def create_authors_names_sframe():
    """
    Creates authors names SFrames from txt files, adding derived
    'First name' and 'Last name' columns (first/last whitespace token of
    the full name).
    """
    logger.info("Creating Authors Names SFrame")
    # Bug fix: the guard previously checked AUTHORS_NAMES_TXT (the input text
    # file, never a directory), so the SFrame was rebuilt on every call.
    # Check the output SFrame path instead, consistent with the other
    # create_*_sframe functions.
    if os.path.isdir(AUTHOR_NAMES_SFRAME):
        return
    a_sf = tc.SFrame.read_csv(AUTHORS_NAMES_TXT, header=False, delimiter="\t")
    a_sf = a_sf.rename({'X1': 'Author ID', 'X2': 'Author name'})
    a_sf['First name'] = a_sf['Author name'].apply(lambda s: s.split()[0])
    a_sf['Last name'] = a_sf['Author name'].apply(lambda s: s.split()[-1])
    a_sf.save(AUTHOR_NAMES_SFRAME)
def update_venue_stats(self):
    """
    Load the venue's JSON stats file, refresh the by-year author/paper
    features, and write the file back.
    """
    path = f"/data/json/journals/{self.name} ({self.venue_id}).json"
    # Bug fix: the file objects from bare open() calls were never closed;
    # use context managers so the handles are released (and the written
    # file is flushed) deterministically.
    with open(path, "r") as json_file:
        j = json.load(json_file)
    logger.info(f"update features of {self.name}")
    j["median_number_of_authors_by_year"] = self.get_venue_median_number_of_authors_by_year(
    )
    j["number_of_papers_in_a_year"] = self.get_number_of_papers_by_year()
    with open(path, "w") as json_file:
        json.dump(j, json_file)
def create_keywords_sframe():
    """ Creating Keywords SFrame from txt files """
    logger.info("Creating Keywords SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPER_KEYWORDS_SFRAME):
        return
    keywords = tc.SFrame.read_csv(PAPER_KEYWORDS_TXT, header=False,
                                  delimiter="\t")
    keywords = keywords.rename({
        "X1": "Paper ID",
        "X2": "Keyword name",
        "X3": "Field of study ID mapped to keyword"
    })
    keywords.save(PAPER_KEYWORDS_SFRAME)
def get_authors_papers_dict_sframe(self):
    """
    Create SFrame in which each row contains an AuthorId and a dict with the author's publication by year dict

    :return: SFrame with Authors ID and Papers by Years Dict columns
    :rtype: tc.SFrame
    """
    logger.info("Calcualting authors' papers by year")
    papers = self.paper_authors_years
    # pair each paper with its publication year so CONCAT keeps both
    papers['Paper Year'] = papers.apply(lambda row: (row["Year"], row["PaperId"]))
    grouped = papers.groupby("AuthorId",
                             {"Papers List": agg.CONCAT("Paper Year")})
    grouped['Papers by Years Dict'] = grouped["Papers List"].apply(
        _entities_years_list_to_dict)
    return grouped.remove_column("Papers List")
def get_all_journals_features():
    """
    Compute and dump venue stats JSON for every journal SFrame under
    /data/sframes/journals; a failure on one journal is logged and does not
    stop the loop.
    """
    # Bug fix: the pickle file handle from a bare open() was never closed.
    with open(JOURNAL_AUTHORS_ACADEMIC_BIRTHYEAR_PKL, "rb") as pkl_file:
        academic_birthyear_dict = pickle.load(pkl_file)
    for i in os.listdir("/data/sframes/journals"):
        logger.info(f"Analyzing {i}")
        try:
            jid = i.split(".sframe")[0]
            va = VenueAnalyzer(jid, VenueType.journal, academic_birthyear_dict)
            j = va.get_venue_stats()
            # Bug fix: close (and flush) the output file via a context manager.
            with open(f"/data/json/journals/{va.name} ({va.venue_id}).json",
                      "w") as json_file:
                json.dump(j, json_file)
        except Exception as e:
            # Bug fix: BaseException.message was removed in Python 3; e.message
            # raised AttributeError and masked the real error.
            print(e)
            logger.exception(e)
def create_field_of_study_hierarchy_sframe():
    """ Creates field of study hierarchy sframe from txt files """
    logger.info("Creating Field of Study Hierarchy SFrame")
    # already built -- nothing to do
    if os.path.isdir(FIELDS_OF_STUDY_HIERARCHY_SFRAME):
        return
    hierarchy = tc.SFrame.read_csv(FIELDS_OF_STUDY_HIERARCHY_TXT,
                                   header=False,
                                   delimiter="\t")
    hierarchy = hierarchy.rename({
        "X1": "Child field of study ID",
        "X2": "Child field of study level",
        "X3": "Parent field of study ID",
        "X4": "Parent field of study level",
        "X5": "Confidence"
    })
    hierarchy.save(FIELDS_OF_STUDY_HIERARCHY_SFRAME)
def create_papers_fields_of_study(flevels=(0, 1, 2, 3)):
    """
    Create SFrame with each paper fields of study by hierarchical levels

    :param flevels: list of levels, for each level add the papers fields of
        study in this level
    """
    logger.info("Creating Papers Fields of Study SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPERS_FIELDS_OF_STUDY_SFRAME):
        return
    keywords = tc.load_sframe(KEYWORDS_SFRAME)
    g = keywords.groupby('Paper ID', {
        'Field of study list':
        agg.CONCAT("Field of study ID mapped to keyword")
    })
    fh = FieldsHierarchyAnalyzer()
    # resolve field-of-study names from their ids
    g['Field of study list names'] = [
        [fh.get_field_name(field_id) for field_id in field_ids]
        for field_ids in g['Field of study list']
    ]
    for flevel in flevels:
        logger.info("Adding papers fields of study level %s" % flevel)
        parent_col = 'Fields of study parent list (L%s)' % flevel
        # union of all level-`flevel` ancestors across the paper's fields
        g[parent_col] = [
            list(
                set.union(*[
                    fh.get_parents_field_of_study(field, flevel)
                    for field in field_ids
                ])) for field_ids in g['Field of study list']
        ]
        # resolve the ancestors' names as well
        g['Fields of study parent list names (L%s)' % flevel] = [
            [fh.get_field_name(parent_id) for parent_id in parent_ids]
            for parent_ids in g[parent_col]
        ]
    g.save(PAPERS_FIELDS_OF_STUDY_SFRAME)
def create_paper_author_affiliations_sframe():
    """
    Creating authors affilation SFrame from txt files
    :return:
    """
    logger.info("Creating Author Affilliations SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPER_AUTHOR_AFFILIATIONS_SFRAME):
        return
    affiliations = tc.SFrame.read_csv(PAPER_AUTHOR_AFFILIATIONS_TXT,
                                      header=False,
                                      delimiter="\t")
    affiliations = affiliations.rename({
        "X1": "Paper ID",
        "X2": "Author ID",
        "X3": "Affiliation ID",
        "X4": "Original affiliation name",
        "X5": "Normalized affiliation name",
        "X6": "Author sequence number"
    })
    affiliations.save(PAPER_AUTHOR_AFFILIATIONS_SFRAME)
def _get_author_feature_by_year_sframe(self, feature_name, feature_col_name):
    """
    Create a SFrame with AuthorId and a dict with the author's input feature (feature_name) over the years values

    :param feature_name: input feature name
    :param feature_col_name: the Sframe column name which contains dict with
        the author feature_name values over the years
    :return: SFrame with AuthorId and feature_col_name columns
    :rtype: tc.SFrame
    """
    logger.info("Calcualting authors feature %s by year" % feature_name)
    features = self.paper_author_affiliation_sframe['AuthorId', 'Year',
                                                    feature_name]
    # pair each feature value with its (int) year so CONCAT keeps both
    features['Feature Year'] = features.apply(
        lambda row: (int(row["Year"]), row[feature_name]))
    grouped = features.groupby("AuthorId",
                               {"Feature List": agg.CONCAT("Feature Year")})
    grouped[feature_col_name] = grouped["Feature List"].apply(
        _entities_years_list_to_dict)
    return grouped.remove_column("Feature List")
def insert_sframe(self, sf, db_name, collection_name, insert_rows_iter=100000, index_cols_list=()):
    """
    Insert the input SFrame into the input DB and collection

    :param sf: SFrame object
    :param db_name: DB name
    :param collection_name: collection names
    :param insert_rows_iter: how many rows to insert in each iteration
    :param index_cols_list: list of columns to add index to each element in
        the list is atuple with the column names and if the column is unique
    """
    total_rows = len(sf)
    collection = self._client[db_name][collection_name]
    # insert the SFrame rows in fixed-size batches
    for start in range(0, total_rows, insert_rows_iter):
        logger.info("Inserting rows %s - %s to %s.%s" %
                    (start, start + insert_rows_iter, db_name, collection_name))
        batch = sf[start: start + insert_rows_iter]
        collection.insert_many([row for row in batch])
    # create the requested indexes once all rows are in place
    for col_name, is_unique in index_cols_list:
        self.create_index(db_name, collection_name, col_name, unique=is_unique)
def create_papers_authors_lists_sframe():
    """
    Create SFrame in which each row contains paper id and a sorted list of the paper's authors
    """
    logger.info("Creating Authors Lists SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPERS_ORDERED_AUTHORS_LIST_SFRAME):
        return
    authors = tc.load_sframe(PAPER_AUTHOR_AFFILIATIONS_SFRAME)[
        "Paper ID", "Author ID", "Author sequence number"]
    # keep (author, sequence) pairs so the order survives the groupby
    authors['Author_Seq'] = authors.apply(
        lambda row: [row["Author ID"], row["Author sequence number"]])
    grouped = authors.groupby("Paper ID",
                              {"Authors List": agg.CONCAT('Author_Seq')})
    # order each paper's authors by sequence number, then drop the numbers
    grouped['Authors List Sorted'] = grouped["Authors List"].apply(
        lambda pairs: [pair[0] for pair in sorted(pairs, key=lambda p: p[1])])
    grouped = grouped.remove_column("Authors List")
    grouped = grouped["Paper ID", 'Authors List Sorted']
    grouped['Authors Number'] = grouped['Authors List Sorted'].apply(len)
    grouped.save(PAPERS_ORDERED_AUTHORS_LIST_SFRAME)
def _create_field_of_study_paper_ids_sframe(level):
    """
    Create SFrame in which each row contains a field of study and it's matching list of paper ids

    :param level: field of study level
    :return: SFrame with the fields of study in the input level and their papers ids
    :rtype: tc.SFrame
    """
    logger.info("Creating fields os study paper ids SFrame level - %s " % level)
    col = 'Fields of study parent list (L%s)' % level
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    new_col_name = "Field ID"
    sf = sf.stack(col, new_column_name=new_col_name)
    # Bug fix: stack() replaces `col` with `new_col_name`, so filtering on the
    # old column name referenced a column that no longer exists; filter the
    # stacked column for missing field ids instead.
    sf = sf[sf[new_col_name] != None]
    g = sf.groupby(new_col_name, {'Paper IDs': agg.CONCAT("Paper ID")})
    f_sf = tc.load_sframe(FIELDS_OF_STUDY_SFRAME)
    # attach the field-of-study name for each field id
    g = g.join(f_sf, on={new_col_name: "Field of study ID"})
    g['Number of Paper'] = g['Paper IDs'].apply(lambda l: len(l))
    g['Level'] = level
    g = g.rename({new_col_name: "Field of study ID"})
    return g
def create_extended_papers_sframe():
    """
    Created extended papers SFrame which contains various papers features,
    such as paper citation numbers, authors list, urls,.. etc

    :return:
    """
    logger.info("Creating Extended Papers SFrame")
    # already built -- nothing to do
    if os.path.isdir(EXTENDED_PAPERS_SFRAME):
        return
    sf = tc.load_sframe(PAPERS_SFRAME)
    sframes_list = [
        PAPER_REFERENCES_COUNT_SFRAME, PAPERS_CITATIONS_BYYEAR_SFRAME,
        PAPERS_ORDERED_AUTHORS_LIST_SFRAME, PAPER_KEYWORDS_LIST_SFRAME,
        PAPERS_FIELDS_OF_STUDY_SFRAME, PAPER_URLS_SFRAME
    ]
    # left-join every feature SFrame onto the base papers SFrame
    for s in sframes_list:
        t = tc.load_sframe(s)
        sf = sf.join(t, how="left", on="Paper ID")
    # papers missing from the references-count SFrame have no references
    sf = sf.fillna("Ref Number", 0)
    # Fix: save once, after fillna -- the original saved the (large) SFrame
    # to the same path both before and after fillna, doubling the write cost
    # for the same final result.
    sf.save(EXTENDED_PAPERS_SFRAME)
def get_co_authors_dict_sframe(self):
    """
    Create SFrame with each author's coauthors by year

    :return: SFrame with AuthorId and Coauthors by Years Dict
    :rtype: tc.SFrame
    :note: the function can take considerable amount of time to execute
    """
    logger.info("Calcualting authors' coauthors by year")
    sf = self.paper_authors_years
    # self-join on PaperId pairs every author of a paper with every other
    # author of the same paper; duplicate columns get a '.1' suffix
    sf = sf.join(sf, on='PaperId')
    # drop the rows that paired an author with themselves
    sf = sf[sf['AuthorId'] != sf['AuthorId.1']]
    sf = sf.remove_column('Year.1')
    # collect each author's coauthors per year
    sf = sf.groupby(['AuthorId', 'Year'],
                    {'Coauthors List': agg.CONCAT('AuthorId.1')})
    sf['Coauthors Year'] = sf.apply(lambda r: (r['Year'], r['Coauthors List']))
    # then fold the (year, coauthors) pairs into one list per author
    sf = sf.groupby("AuthorId",
                    {'Coauthors list': agg.CONCAT('Coauthors Year')})
    # and convert that list into a year -> coauthors dict
    sf['Coauthors by Years Dict'] = sf['Coauthors list'].apply(
        lambda l: {y: coa_list for y, coa_list in l})
    sf = sf.remove_column('Coauthors list')
    return sf
def get_sjr_dict(self, venue_name, issn_list=()):
    """
    Get's the venue SJR data from venue name's or ISSN values

    :param venue_name: venue name
    :param issn_list: issn values list (optional)
    :return: dict mapping each matching ISSN to its list of SJR records
        (one record per year the journal appears in the SJR dataset)
    :rtype: dict<str, list<dict>>
    :note: ISSN values in the SJR dataset are 8 digits (no dash)
    """
    # Doc fix: the original docstring claimed the function returns a list,
    # but it builds and returns a dict keyed by ISSN.
    logger.info(
        f"Get SJR data of venue_name={venue_name}, issn_list={issn_list}")
    matches = [j for j in self._sjr_collection.find({"Title": venue_name})]
    for issn in issn_list:
        # the SJR collection stores ISSNs without the dash
        issn = issn.replace('-', '')
        matches += [j for j in self._sjr_collection.find({"ISSN": issn})]
    # group the matching records by their ISSN
    sjr_data = {}
    for j in matches:
        sjr_data.setdefault(j["ISSN"], []).append(j)
    return sjr_data
def create_extended_references_sframe():
    """
    Create SFrame with references data with additional column that state if
    the reference is self-citation.

    The added 'self citation' column holds the NUMBER of authors shared by
    the citing and cited papers (0 means no self-citation).
    """
    logger.info("Creating Extended References SFrame")
    # already built -- nothing to do
    if os.path.isdir(EXTENDED_PAPER_REFERENCES_SFRAME):
        return
    ref_sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    p_sf = tc.load_sframe(PAPERS_ORDERED_AUTHORS_LIST_SFRAME)
    # attach the citing paper's author list...
    ref_sf = ref_sf.join(p_sf, on='Paper ID', how="left")
    # ...and the cited paper's author list (gets the '.1' suffix)
    ref_sf = ref_sf.join(p_sf, on={'Paper reference ID': 'Paper ID'}, how="left")
    # papers with no known authors get an empty list so the set-intersection
    # below is well defined
    ref_sf = ref_sf.fillna('Authors List Sorted.1', [])
    ref_sf = ref_sf.fillna('Authors List Sorted', [])
    ref_sf.__materialize__()
    # count of authors common to the citing and cited papers
    ref_sf['self citation'] = ref_sf.apply(lambda r: len(
        set(r['Authors List Sorted.1']) & set(r['Authors List Sorted'])))
    ref_sf.__materialize__()
    ref_sf = ref_sf.remove_columns(
        ['Authors List Sorted.1', 'Authors List Sorted'])
    ref_sf.save(EXTENDED_PAPER_REFERENCES_SFRAME)
def create_papers_sframe():
    """
    Create the Papers SFrame object from txt files which contains information
    on each paper
    """
    logger.info("Creating Papers SFrame")
    # already built -- nothing to do
    if os.path.isdir(PAPERS_SFRAME):
        return
    sf = tc.SFrame.read_csv(PAPERS_TXT, header=False, delimiter="\t")
    sf = sf.rename({
        "X1": "Paper ID",
        "X2": "Original paper title",
        "X3": "Normalized paper title",
        "X4": "Paper publish year",
        "X5": "Paper publish date",
        "X6": "Paper Document Object Identifier (DOI)",
        # Bug fix: X7 was renamed to the empty string ""; per the MAG
        # Papers.txt schema the 7th column is the original venue name.
        "X7": "Original venue name",
        "X8": "Normalized venue name",
        "X9": "Journal ID mapped to venue name",
        "X10": "Conference ID mapped to venue name",
        "X11": "Paper rank"
    })
    sf["Paper publish year"] = sf["Paper publish year"].astype(int)
    sf.save(PAPERS_SFRAME)
def load_sframes(mag, sjr, joined):
    # from ScienceDynamics.config.configs import DATASETS_BASE_DIR
    # mag = MicrosoftAcademicGraph(DATASETS_BASE_DIR / "MicrosoftAcademicGraph.zip")
    """ Load the journals/authors sframes to Mongo

    :param mag: MicrosoftAcademicGraph-like object exposing extended_papers
    :param sjr: object exposing the SJR dataset via .data
    :param joined: object exposing aminer_mag_links_by_doi
    """
    logger.info("Loading authors features")
    md = MongoDBConnector()
    a = AuthorsFeaturesExtractor(mag)
    sf = a.authors_features
    logger.info("Converting")
    # Mongo document keys must be strings; convert the year dict keys
    sf = _convert_sframe_dict_key_to_str(sf, [c for c in sf.column_names() if "Year" in c])
    # normalize sequence numbers (floats/ints) to plain string ids
    sf['Sequence Number by Year Dict'] = sf['Sequence Number by Year Dict'].apply(
        lambda d: {k: [str(int(float(i))) for i in v] for k, v in d.items()})
    sf.materialize()
    index_list = [('Author ID', True), ('Author name', False)]
    md.insert_sframe(sf, 'journals', 'authors_features', index_cols_list=index_list)

    logger.info("Loading papers features")
    sf = mag.extended_papers
    index_list = [('OriginalVenue', False), ('PaperId', True), ('ConferenceSeriesId', False),
                  ('ConferenceInstanceId', False), ('JournalId', False)]
    md.insert_sframe(sf, 'journals', 'papers_features', index_cols_list=index_list)

    logger.info("Loading SJR features")
    sf = sjr.data
    # Mongo field names cannot contain '.'; strip dots from column names
    sf = sf.rename({c: c.replace(".", "") for c in sf.column_names()})
    sf['Title'] = sf['Title'].apply(lambda t: t.encode('utf-8'))
    index_list = [('Title', False), ('ISSN', False)]
    md.insert_sframe(sf, 'journals', 'sjr_journals', index_cols_list=index_list)

    # load the AMiner/MAG linked papers collection
    sf = joined.aminer_mag_links_by_doi
    sf = sf.rename({c: c.replace(".", "") for c in sf.column_names()})
    index_list = [('OriginalVenue', False), ('MAG Paper ID', True),
                  ('Conference ID mapped to venue name', False),
                  ('Journal ID mapped to venue name', False), ('issn', False)]
    md.insert_sframe(sf, 'journals', 'aminer_mag_papers', index_cols_list=index_list)
def get_papers_sframe(min_ref_num=None, start_year=None, end_year=None):
    """
    Return SFrame with Papers data accoring to the input filter variables

    :param min_ref_num: paper's minimal references number
    :param start_year: start year (only include paper that were published after start year)
    :param end_year: end year (only include paper that were published before end year)
    :return: SFrame with paper data
    :rtype: tc.SFrame
    :note: after the SFrame is created it is saved to the TMP_DIR to future use
    """
    sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    tmp_papers_sf_path = _get_tmp_papers_sframe_path(min_ref_num, start_year, end_year)
    # reuse a previously cached result for the same filter combination
    if os.path.isdir(tmp_papers_sf_path):
        return tc.load_sframe(tmp_papers_sf_path)

    if min_ref_num is not None:
        logger.info(
            f"Getting papers ids with at least refrences {min_ref_num}")
        sf = sf.groupby(
            'Paper ID',
            {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
        sf = sf[sf['Ref Count'] >= min_ref_num]  # left with 22,083,058
        sf.__materialize__()
    p_sf = tc.load_sframe(PAPERS_SFRAME)
    # join on the shared 'Paper ID' column.
    # NOTE(review): when min_ref_num is None, `sf` is still the raw references
    # SFrame (one row per reference), so this join presumably duplicates paper
    # rows -- verify callers always pass min_ref_num, or that this is intended.
    sf = p_sf.join(sf)
    if start_year is not None:
        logger.info("Getting papers with from %s " % start_year)
        sf = sf[sf['Paper publish year'] >= start_year]
    if end_year is not None:
        logger.info("Getting papers with util %s " % end_year)
        sf = sf[sf['Paper publish year'] <= end_year]
    sf.__materialize__()

    # cache the filtered SFrame for future calls with the same filters
    if not os.path.isdir(tmp_papers_sf_path):
        sf.save(tmp_papers_sf_path)

    return sf