Пример #1
0
def test_check_db():
    """Verify that check_db hands back an SCDB instance for each kind of input."""

    # No database supplied: the result should still be an SCDB
    assert isinstance(check_db(None), SCDB)

    # Existing SCDB supplied: the result should be an SCDB as well
    existing = SCDB()
    assert isinstance(check_db(existing), SCDB)
    def save_summary(self, db=None):
        """Write the summary of the scraped term paper data to a json file.

        The file is written to '<words_path>/summary/<label>.json', where
        `words_path` is read from the database object given by `check_db(db)`.
        """

        db = check_db(db)
        summary_file = db.words_path + '/summary/' + self.label + '.json'

        with open(summary_file, 'w') as out:
            json.dump(self.summary, out)
Пример #3
0
def plot_years(year_counts, label, disp_fig=True, save_fig=False, db=None):
    """Plot the number of publications per year as a line plot with markers.

    Parameters
    ----------
    year_counts : list of tuple
        Sequence of pairs: the first element of each pair is plotted on the
        x-axis (year), the second on the y-axis (number of publications).
    label : str
        Label used as the file name when the figure is saved.
    disp_fig : boolean, optional
        Whether to leave the figure open for display. If False, it is closed.
    save_fig : boolean, optional
        Whether to save the figure out as an svg file.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    plt.subplots(figsize=(10, 5))

    # Fixed range of years shown on the x-axis
    first_year, last_year = 1985, 2015

    # Extract x & y data to plot
    x_dat = [y[0] for y in year_counts]
    y_dat = [y[1] for y in year_counts]

    # Add line and points to plot
    plt.plot(x_dat, y_dat)
    plt.plot(x_dat, y_dat, '.', markersize=16)

    # Set plot limits - the default guards against empty input data
    plt.xlim([first_year, last_year])
    plt.ylim([0, max(y_dat, default=0) + 5])

    # Add title & labels
    plt.title('Publication History', fontsize=24, fontweight='bold')
    plt.xlabel('Year', fontsize=18)
    plt.ylabel('# Pubs', fontsize=18)

    if save_fig:

        db = check_db(db)
        s_file = os.path.join(db.figs_path, 'year', label + '.svg')

        plt.savefig(s_file, transparent=True)

    # Close the figure whenever display is not requested, saved or not
    if not disp_fig:
        plt.close()
def make_wc(freq_dist, n_words, label, disp_fig=True, save_fig=False, db=None):
    """Create and display wordcloud.

    Parameters
    ----------
    freq_dist : frequency distribution
        Word frequencies, passed on to `conv_freqs` to build the wordcloud from.
    n_words : int
        Number of top words to include in the wordcloud.
    label : str
        Label used as the file name when the figure is saved.
    disp_fig : boolean, optional
        Whether to leave the figure open for display. If False, it is closed.
    save_fig : boolean
        Whether to save out the wordcloud.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    # Use the requested number of words, rather than a fixed count
    wc = create_wc(conv_freqs(freq_dist, n_words))

    plt.figure(figsize=(10, 10))
    plt.imshow(wc)
    plt.axis("off")

    if save_fig:

        db = check_db(db)
        s_file = os.path.join(db.figs_path, 'wc', label + '.svg')

        plt.savefig(s_file, transparent=True)

    # Close the figure whenever display is not requested, saved or not
    if not disp_fig:
        plt.close()
def load_pickle_obj(f_name, db=None):
    """Load a custom object, from a pickle file, for SCANR project.

    Parameters
    ----------
    f_name : str
        File name of the object to be loaded, without the '.p' extension.
    db : SCDB object, optional
        Database object for the SCANR project.

    Returns
    -------
    object
        The object unpickled from the matching file.

    Raises
    ------
    InconsistentDataError
        If no matching file is found in the counts or words directories.
    """

    # Check for database object, initialize if not provided
    db = check_db(db)

    # Get all available files, for Count and Words pickled objects
    counts_objs = os.listdir(db.counts_path)
    words_objs = os.listdir(db.words_path)

    pickle_name = f_name + '.p'

    # Search saved Count files first, then saved Words files
    if pickle_name in counts_objs:
        load_path = os.path.join(db.counts_path, pickle_name)
    elif pickle_name in words_objs:
        load_path = os.path.join(db.words_path, pickle_name)

    # Raise an error if the file name is not found
    else:
        raise InconsistentDataError('Can not find requested file name.')

    # NOTE: unpickling can execute arbitrary code - only load trusted files
    # The context manager makes sure the file handle is closed after loading
    with open(load_path, 'rb') as load_file:
        return pickle.load(load_file)
Пример #6
0
def make_wc(freq_dist, n_words, label, disp_fig=True, save_fig=False, db=None):
    """Create and display wordcloud.

    Parameters
    ----------
    freq_dist : frequency distribution
        Word frequencies, passed on to `conv_freqs` to build the wordcloud from.
    n_words : int
        Number of top words to include in the wordcloud.
    label : str
        Label used as the file name when the figure is saved.
    disp_fig : boolean, optional
        Whether to leave the figure open for display. If False, it is closed.
    save_fig : boolean
        Whether to save out the wordcloud.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    # Use the requested number of words, rather than a fixed count
    wc = create_wc(conv_freqs(freq_dist, n_words))

    plt.figure(figsize=(10, 10))
    plt.imshow(wc)
    plt.axis("off")

    if save_fig:

        db = check_db(db)
        s_file = os.path.join(db.figs_path, 'wc', label + '.svg')

        plt.savefig(s_file, transparent=True)

    # Close the figure whenever display is not requested, saved or not
    if not disp_fig:
        plt.close()
Пример #7
0
def load_pickle_obj(f_name, db=None):
    """Load a custom object, from a pickle file, for SCANR project.

    Parameters
    ----------
    f_name : str
        File name of the object to be loaded, without the '.p' extension.
    db : SCDB object, optional
        Database object for the SCANR project.

    Returns
    -------
    object
        The object unpickled from the matching file.

    Raises
    ------
    InconsistentDataError
        If no matching file is found in the counts or words directories.
    """

    # Check for database object, initialize if not provided
    db = check_db(db)

    # Get all available files, for Count and Words pickled objects
    counts_objs = os.listdir(db.counts_path)
    words_objs = os.listdir(db.words_path)

    pickle_name = f_name + '.p'

    # Search saved Count files first, then saved Words files
    if pickle_name in counts_objs:
        load_path = os.path.join(db.counts_path, pickle_name)
    elif pickle_name in words_objs:
        load_path = os.path.join(db.words_path, pickle_name)

    # Raise an error if the file name is not found
    else:
        raise InconsistentDataError('Can not find requested file name.')

    # NOTE: unpickling can execute arbitrary code - only load trusted files
    # The context manager makes sure the file handle is closed after loading
    with open(load_path, 'rb') as load_file:
        return pickle.load(load_file)
def plot_clustermap(dat, cmap='purple', save_fig=False, save_name='Clustermap', db=None):
    """Plot clustermap.

    Parameters
    ----------
    dat : pandas.DataFrame
        Data to create clustermap from.
    cmap : optional
        Colourmap to use. The strings 'purple' and 'blue' select cubehelix
        palettes; any other value is passed to seaborn unchanged.
    save_fig : boolean, optional
        Whether to save the figure out as an svg file.
    save_name : str, optional
        File name (without extension) to use when saving the figure.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    # Set up plotting and aesthetics
    sns.set()
    sns.set_context("paper", font_scale=1.5)

    # Set colourmap
    if cmap == 'purple':
        cmap = sns.cubehelix_palette(as_cmap=True)
    elif cmap == 'blue':
        cmap = sns.cubehelix_palette(as_cmap=True, rot=-.3, light=0.9, dark=0.2)

    # Create the clustermap
    cg = sns.clustermap(dat, cmap=cmap, method='complete', metric='cosine', figsize=(12, 10))

    # Fix axes
    cg.cax.set_visible(True)
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=60, ha='right')
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    # Save out - if requested
    if save_fig:

        # db is a parameter so that this lookup does not hit an unbound local
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')

        cg.savefig(s_file, transparent=True)
Пример #9
0
    def save_summary(self, db=None):
        """Write the summary of the scraped term paper data to a json file.

        The file is written to '<words_path>/summary/<label>.json', where
        `words_path` is read from the database object given by `check_db(db)`.
        """

        db = check_db(db)
        summary_file = db.words_path + '/summary/' + self.label + '.json'

        with open(summary_file, 'w') as out:
            json.dump(self.summary, out)
    def save(self, db=None):
        """Write all attached data out to a json file, one entry per line.

        Each item yielded by iterating over this object is dumped as json on
        its own line of '<words_path>/raw/<label>.json'.
        """

        db = check_db(db)
        raw_file = db.words_path + '/raw/' + self.label + '.json'

        with open(raw_file, 'w') as out:
            for entry in self:
                json.dump(entry, out)
                out.write('\n')

        # Record the save in the object's history
        self.update_history('Saved')
Пример #11
0
    def save(self, db=None):
        """Write all attached data out to a json file, one entry per line.

        Each item yielded by iterating over this object is dumped as json on
        its own line of '<words_path>/raw/<label>.json'.
        """

        db = check_db(db)
        raw_file = db.words_path + '/raw/' + self.label + '.json'

        with open(raw_file, 'w') as out:
            for entry in self:
                json.dump(entry, out)
                out.write('\n')

        # Record the save in the object's history
        self.update_history('Saved')
def plot_matrix(dat, x_labels, y_labels, square=False, figsize=(10, 12), save_fig=False, save_name='Matrix', db=None):
    """Plot the matrix of percent associations between terms.

    Parameters
    ----------
    dat : 2d array-like
        Data to plot as a heatmap.
    x_labels : list of str
        Tick labels for the x-axis.
    y_labels : list of str
        Tick labels for the y-axis.
    square : boolean, optional
        Passed to seaborn's heatmap, to set whether cells are drawn square.
    figsize : tuple, optional
        Size of the figure to create.
    save_fig : boolean, optional
        Whether to save the figure out as an svg file.
    save_name : str, optional
        File name (without extension) to use when saving the figure.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    f, _ = plt.subplots(figsize=figsize)

    sns.heatmap(dat, square=square, xticklabels=x_labels, yticklabels=y_labels)

    f.tight_layout()

    # Save out - if requested
    if save_fig:

        # db is a parameter so that this lookup does not hit an unbound local
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')

        plt.savefig(s_file)
Пример #13
0
def plot_clustermap(dat,
                    cmap='purple',
                    save_fig=False,
                    save_name='Clustermap',
                    db=None):
    """Plot clustermap.

    Parameters
    ----------
    dat : pandas.DataFrame
        Data to create clustermap from.
    cmap : optional
        Colourmap to use. The strings 'purple' and 'blue' select cubehelix
        palettes; any other value is passed to seaborn unchanged.
    save_fig : boolean, optional
        Whether to save the figure out as an svg file.
    save_name : str, optional
        File name (without extension) to use when saving the figure.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    # Set up plotting and aesthetics
    sns.set()
    sns.set_context("paper", font_scale=1.5)

    # Set colourmap
    if cmap == 'purple':
        cmap = sns.cubehelix_palette(as_cmap=True)
    elif cmap == 'blue':
        cmap = sns.cubehelix_palette(as_cmap=True,
                                     rot=-.3,
                                     light=0.9,
                                     dark=0.2)

    # Create the clustermap
    cg = sns.clustermap(dat,
                        cmap=cmap,
                        method='complete',
                        metric='cosine',
                        figsize=(12, 10))

    # Fix axes
    cg.cax.set_visible(True)
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(),
             rotation=60,
             ha='right')
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    # Save out - if requested
    if save_fig:

        # db is a parameter so that this lookup does not hit an unbound local
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')

        cg.savefig(s_file, transparent=True)
def plot_dendrogram(dat, labels, save_fig=False, save_name='Dendrogram', db=None):
    """Plot dendrogram.

    Parameters
    ----------
    dat : array-like
        Data to compute the hierarchical clustering from.
    labels : list of str
        Labels for the leaves of the dendrogram.
    save_fig : boolean, optional
        Whether to save the figure out as an svg file.
    save_name : str, optional
        File name (without extension) to use when saving the figure.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    plt.figure(figsize=(3, 15))

    linkage = hier.linkage(dat, method='complete', metric='cosine')

    # Called for its plotting side effect; the returned dict is not needed
    hier.dendrogram(linkage, orientation='left', labels=labels,
                    color_threshold=0.25, leaf_font_size=12)

    # Save out - if requested
    if save_fig:

        # db is a parameter so that this lookup does not hit an unbound local
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')

        # Save the current pyplot figure - there is no clustergrid object here
        plt.savefig(s_file, transparent=True)
Пример #15
0
    def load(self, db=None):
        """Read raw data back in from this object's json file.

        Each entry parsed from '<words_path>/raw/<label>.json' is added to this
        object field by field, and the results are checked once all are added.
        """

        db = check_db(db)
        raw_file = db.words_path + '/raw/' + self.label + '.json'

        for entry in _parse_json_dat(raw_file):

            self.add_id(entry['id'])
            self.add_title(entry['title'])

            # Journal is stored as a pair, passed on as two separate arguments
            self.add_journal(entry['journal'][0], entry['journal'][1])

            self.add_authors(entry['authors'])
            self.add_words(entry['words'])
            self.add_kws(entry['kws'])

            # Publication date is passed on as a [year, month] list
            pub_date = [entry['year'], entry['month']]
            self.add_pub_date(pub_date)

            self.add_doi(entry['doi'])
            self.increment_n_articles()

        self.check_results()
    def load(self, db=None):
        """Read raw data back in from this object's json file.

        Each entry parsed from '<words_path>/raw/<label>.json' is added to this
        object field by field, and the results are checked once all are added.
        """

        db = check_db(db)
        raw_file = db.words_path + '/raw/' + self.label + '.json'

        for entry in _parse_json_dat(raw_file):

            self.add_id(entry['id'])
            self.add_title(entry['title'])

            # Journal is stored as a pair, passed on as two separate arguments
            self.add_journal(entry['journal'][0], entry['journal'][1])

            self.add_authors(entry['authors'])
            self.add_words(entry['words'])
            self.add_kws(entry['kws'])

            # Publication date is passed on as a [year, month] list
            pub_date = [entry['year'], entry['month']]
            self.add_pub_date(pub_date)

            self.add_doi(entry['doi'])
            self.increment_n_articles()

        self.check_results()
Пример #17
0
def plot_dendrogram(dat, labels, save_fig=False, save_name='Dendrogram', db=None):
    """Plot dendrogram.

    Parameters
    ----------
    dat : array-like
        Data to compute the hierarchical clustering from.
    labels : list of str
        Labels for the leaves of the dendrogram.
    save_fig : boolean, optional
        Whether to save the figure out as an svg file.
    save_name : str, optional
        File name (without extension) to use when saving the figure.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    plt.figure(figsize=(3, 15))

    linkage = hier.linkage(dat, method='complete', metric='cosine')

    # Called for its plotting side effect; the returned dict is not needed
    hier.dendrogram(linkage,
                    orientation='left',
                    labels=labels,
                    color_threshold=0.25,
                    leaf_font_size=12)

    # Save out - if requested
    if save_fig:

        # db is a parameter so that this lookup does not hit an unbound local
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')

        # Save the current pyplot figure - there is no clustergrid object here
        plt.savefig(s_file, transparent=True)
Пример #18
0
def plot_matrix(dat,
                x_labels,
                y_labels,
                square=False,
                figsize=(10, 12),
                save_fig=False,
                save_name='Matrix',
                db=None):
    """Plot the matrix of percent associations between terms.

    Parameters
    ----------
    dat : 2d array-like
        Data to plot as a heatmap.
    x_labels : list of str
        Tick labels for the x-axis.
    y_labels : list of str
        Tick labels for the y-axis.
    square : boolean, optional
        Passed to seaborn's heatmap, to set whether cells are drawn square.
    figsize : tuple, optional
        Size of the figure to create.
    save_fig : boolean, optional
        Whether to save the figure out as an svg file.
    save_name : str, optional
        File name (without extension) to use when saving the figure.
    db : SCDB object, optional
        Database object, used to get the path that figures are saved to.
    """

    f, _ = plt.subplots(figsize=figsize)

    sns.heatmap(dat, square=square, xticklabels=x_labels, yticklabels=y_labels)

    f.tight_layout()

    # Save out - if requested
    if save_fig:

        # db is a parameter so that this lookup does not hit an unbound local
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')

        plt.savefig(s_file)
def save_pickle_obj(obj, f_name, db=None):
    """Save a custom object from LISC as a pickle file.

    Parameters
    ----------
    obj : {Count object, Words object}
        LISC custom object to save out.
    f_name : str
        Name to use for the saved file; '_counts.p' or '_words.p' is appended,
        depending on the type of the object.
    db : SCDB() object, optional
        Database object for the LISC project.

    Raises
    ------
    InconsistentDataError
        If the object is neither a Count nor a Words object.
    """

    # Check for database object, initialize if not provided
    db = check_db(db)

    # If it's a Count object, set path and name
    if isinstance(obj, Count):
        save_name = f_name + '_counts.p'
        save_path = db.counts_path

    # If it's a Words object, set path and name
    elif isinstance(obj, Words):
        save_name = f_name + '_words.p'
        save_path = db.words_path

    # If neither, raise error as object type is unclear
    else:
        raise InconsistentDataError('Object type unclear - can not save.')

    # Save pickle file - the context manager flushes and closes the file handle
    save_file = os.path.join(save_path, save_name)
    with open(save_file, 'wb') as outfile:
        pickle.dump(obj, outfile)
Пример #20
0
def save_pickle_obj(obj, f_name, db=None):
    """Save a custom object from LISC as a pickle file.

    Parameters
    ----------
    obj : {Count object, Words object}
        LISC custom object to save out.
    f_name : str
        Name to use for the saved file; '_counts.p' or '_words.p' is appended,
        depending on the type of the object.
    db : SCDB() object, optional
        Database object for the LISC project.

    Raises
    ------
    InconsistentDataError
        If the object is neither a Count nor a Words object.
    """

    # Check for database object, initialize if not provided
    db = check_db(db)

    # If it's a Count object, set path and name
    if isinstance(obj, Count):
        save_name = f_name + '_counts.p'
        save_path = db.counts_path

    # If it's a Words object, set path and name
    elif isinstance(obj, Words):
        save_name = f_name + '_words.p'
        save_path = db.words_path

    # If neither, raise error as object type is unclear
    else:
        raise InconsistentDataError('Object type unclear - can not save.')

    # Save pickle file - the context manager flushes and closes the file handle
    save_file = os.path.join(save_path, save_name)
    with open(save_file, 'wb') as outfile:
        pickle.dump(obj, outfile)