def load_publicationauthor(self, preprocess=True, columns=None, isindict=None, duplicate_subset=None, duplicate_keep='last', dropna=None, show_progress=False): """ Load the PublicationAuthor DataFrame from a preprocessed directory. For DBLP, you must run preprocess before the dataframe is available for use. Parameters ---------- preprocess : bool, default True, Optional Attempt to load from the preprocessed directory. columns : list, default None, Optional Load only this subset of columns isindict : dict, default None, Optional Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column and "ListofValues" is a sorted list of valid values. A DataFrame only containing rows that appear in "ListofValues" will be returned. duplicate_subset : list, default None, Optional Drop any duplicate entries as specified by this subset of columns duplicate_keep : str, default 'last', Optional If duplicates are being dropped, keep the 'first' or 'last' (see `pandas.DataFram.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_) dropna : list, default None, Optional Drop any NaN entries as specified by this subset of columns Returns ------- DataFrame PublicationAuthor DataFrame. """ if show_progress: show_progress = 'Loading PublicationAuthor' if preprocess and os.path.exists( os.path.join(self.path2database, 'publicationauthor')): return load_preprocessed_data('publicationauthor', path2database=self.path2database, columns=columns, isindict=isindict, duplicate_subset=duplicate_subset, duplicate_keep=duplicate_keep, dropna=dropna, show_progress=show_progress) else: raise NotImplementedError( "DBLP is stored as a single xml file. Run preprocess to parse the file." )
def load_impact(self, preprocess=True, include_yearnormed=True, columns=None, isindict=None, duplicate_subset=None, duplicate_keep='last', dropna=None, prefunc2apply=None, postfunc2apply=None, show_progress=False):
    """
    Load the precomputed impact DataFrame from a preprocessed directory.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        Attempt to load from the preprocessed directory.
    include_yearnormed : bool, default True, Optional
        Additionally include each impact column divided by its mean within the
        loaded file chunk (intended as a yearly normalization — confirm the
        preprocessed impact files are stored per-year).
    columns : list, default None, Optional
        Load only this subset of columns
    isindict : dict, default None, Optional
        Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
        and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows
        that appear in "ListofValues" will be returned.
    duplicate_subset : list, default None, Optional
        Drop any duplicate entries as specified by this subset of columns
    duplicate_keep : str, default 'last', Optional
        If duplicates are being dropped, keep the 'first' or 'last'
        (see `pandas.DataFram.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)
    dropna : list, default None, Optional
        Drop any NaN entries as specified by this subset of columns
    prefunc2apply : callable, default None, Optional
        Function applied to each DataFrame chunk before normalization.
    postfunc2apply : callable, default None, Optional
        Function applied to each DataFrame chunk after loading.

    Returns
    -------
    DataFrame
        Impact DataFrame.
    """
    if show_progress:
        show_progress = 'Loading Impact'

    if include_yearnormed:
        def normfunc(impactdf):
            # Add a '_norm' column for every impact column: value / column mean
            # within this chunk. Id/Year columns are left untouched.
            impactcolumns = [c for c in list(impactdf) if not c in ['PublicationId', 'Year']]
            for c in impactcolumns:
                impactdf[c + '_norm'] = impactdf[c] / impactdf[c].mean()
            return impactdf
    else:
        def normfunc(impactdf):
            return impactdf

    # BUG FIX: a caller-supplied prefunc2apply used to be silently discarded
    # (normfunc was passed in its place); chain the two instead.
    if prefunc2apply is None:
        combined_prefunc = normfunc
    else:
        def combined_prefunc(impactdf, _userfunc=prefunc2apply):
            return normfunc(_userfunc(impactdf))

    if preprocess and os.path.exists(os.path.join(self.path2database, 'impact')):
        # BUG FIX: postfunc2apply is now forwarded (it was accepted but ignored),
        # matching the behavior of the other load_* methods.
        return load_preprocessed_data(
            'impact',
            path2database=self.path2database,
            columns=columns,
            isindict=isindict,
            duplicate_subset=duplicate_subset,
            duplicate_keep=duplicate_keep,
            dropna=dropna,
            prefunc2apply=combined_prefunc,
            postfunc2apply=postfunc2apply,
            show_progress=show_progress,
        )
    else:
        # BUG FIX: was `raise self.compute_impact()` — raising a DataFrame is a
        # TypeError; return the freshly computed impact instead.
        return self.compute_impact()
def load_references(self, preprocess=True, columns=None, isindict=None, duplicate_subset=None, duplicate_keep='last', noselfcite=False, dropna=None, prefunc2apply=None, postfunc2apply=None, show_progress=False):
    """
    Load the Pub2Ref DataFrame from a preprocessed directory, or parse from the raw files.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        Attempt to load from the preprocessed directory.
    columns : list, default None, Optional
        Load only this subset of columns
    isindict : dict, default None, Optional
        Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
        and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows
        that appear in "ListofValues" will be returned.
    duplicate_subset : list, default None, Optional
        Drop any duplicate entries as specified by this subset of columns
    duplicate_keep : str, default 'last', Optional
        If duplicates are being dropped, keep the 'first' or 'last'
        (see `pandas.DataFram.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)
    dropna : list, default None, Optional
        Drop any NaN entries as specified by this subset of columns
    noselfcite : bool, default False, Optional
        If True, then the preprocessed pub2ref files with self-citations removed will be used.

    Returns
    -------
    DataFrame
        Pub2Ref DataFrame.
    """
    # Choose between the self-citation-stripped table and the full table.
    fileprefix = 'pub2refnoself' if noselfcite else 'pub2ref'

    # Fall back to parsing the raw files when no preprocessed directory exists.
    if not (preprocess and os.path.exists(os.path.join(self.path2database, fileprefix))):
        return self.parse_references()

    progress_label = 'Loading {}'.format(fileprefix) if show_progress else show_progress
    return load_preprocessed_data(
        fileprefix,
        path2database=self.path2database,
        columns=columns,
        isindict=isindict,
        duplicate_subset=duplicate_subset,
        duplicate_keep=duplicate_keep,
        dropna=dropna,
        prefunc2apply=prefunc2apply,
        postfunc2apply=postfunc2apply,
        show_progress=progress_label,
    )
def load_authors(self, preprocess=True, columns=None, isindict=None, duplicate_subset=None, duplicate_keep='last', dropna=None, prefunc2apply=None, postfunc2apply=None, process_name=True, show_progress=True):
    """
    Load the Author DataFrame from a preprocessed directory, or parse from the raw files.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        Attempt to load from the preprocessed directory.
    columns : list, default None, Optional
        Load only this subset of columns
    isindict : dict, default None, Optional
        Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
        and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows
        that appear in "ListofValues" will be returned.
    duplicate_subset : list, default None, Optional
        Drop any duplicate entries as specified by this subset of columns
    duplicate_keep : str, default 'last', Optional
        If duplicates are being dropped, keep the 'first' or 'last'
        (see `pandas.DataFram.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)
    dropna : list, default None, Optional
        Drop any NaN entries as specified by this subset of columns
    process_name : bool, default True, Optional
        If True, then when processing the raw file, the package
        `NameParser <https://nameparser.readthedocs.io/en/latest/>`_ will be used to
        split author FullNames.

    Returns
    -------
    DataFrame
        Author DataFrame.
    """
    # Fall back to parsing the raw files when no preprocessed directory exists.
    if not (preprocess and os.path.exists(os.path.join(self.path2database, 'author'))):
        return self.parse_authors(process_name=process_name)

    progress_label = 'Loading Authors' if show_progress else show_progress
    return load_preprocessed_data(
        'author',
        path2database=self.path2database,
        columns=columns,
        isindict=isindict,
        duplicate_subset=duplicate_subset,
        duplicate_keep=duplicate_keep,
        dropna=dropna,
        prefunc2apply=prefunc2apply,
        postfunc2apply=postfunc2apply,
        show_progress=progress_label,
    )
def load_pub2field(self, preprocess=True, columns=None, isindict=None, duplicate_subset=None, duplicate_keep='last', dropna=None, prefunc2apply=None, postfunc2apply=None, show_progress=False):
    """
    Load the Pub2Field DataFrame from a preprocessed directory, or parse from the raw files.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        Attempt to load from the preprocessed directory.
    columns : list, default None, Optional
        Load only this subset of columns
    isindict : dict, default None, Optional
        Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
        and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows
        that appear in "ListofValues" will be returned.
    duplicate_subset : list, default None, Optional
        Drop any duplicate entries as specified by this subset of columns
    duplicate_keep : str, default 'last', Optional
        If duplicates are being dropped, keep the 'first' or 'last'
        (see `pandas.DataFram.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)
    dropna : list, default None, Optional
        Drop any NaN entries as specified by this subset of columns

    Returns
    -------
    DataFrame
        Pub2Field DataFrame.
    """
    # Fall back to parsing the raw files when no preprocessed directory exists.
    if not (preprocess and os.path.exists(os.path.join(self.path2database, 'pub2field'))):
        return self.parse_fields()

    progress_label = 'Loading Fields' if show_progress else show_progress
    return load_preprocessed_data(
        'pub2field',
        path2database=self.path2database,
        columns=columns,
        isindict=isindict,
        duplicate_subset=duplicate_subset,
        duplicate_keep=duplicate_keep,
        dropna=dropna,
        prefunc2apply=prefunc2apply,
        postfunc2apply=postfunc2apply,
        show_progress=progress_label,
    )