Example #1
0
 def setUp(self) -> None:
     """Build the processed-corpus reader, a 12-fold loader, and one test fold."""
     corpus_path = "Corpus/Processed_corpus/"
     self.corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(corpus_path)
     self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(
         self.corpus, n_folds=12, shuffle=False)
     # grab the first test-fold file id subset as the fixture subset
     self.subset = next(self.loader.fileids(test=True))
 def setUp(self) -> None:
     """Build reader, k-fold loader, a test fold, and the clustering pipeline."""
     corpus_path = "Corpus/Processed_corpus/"
     self.corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(corpus_path)
     self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(
         self.corpus, n_folds=12, shuffle=False)
     # first test-fold subset of file ids
     self.subset = next(self.loader.fileids(test=True))
     # normalise titles -> one-hot vectors -> hierarchical clustering
     steps = [
         ("norm", Corpus_Vectorizer.TitleNormalizer()),
         ("vect", Corpus_Vectorizer.OneHotVectorizer()),
         ("clusters", Corpus_Cluster.HierarchicalClustering()),
     ]
     self.model = Pipeline(steps)
def process_corpus():
    """Run the pickled-corpus pre-processor over the raw corpus in place."""
    reader = Elsevier_Corpus_Reader.ScopusRawCorpusReader(
        "Corpus/Processed_corpus/")
    # the pre-processor rewrites each pickled document via transform()
    preprocessor = Elsivier_Corpus_Pre_Processor.PickledCorpusPreProcessor(reader)
    preprocessor.transform()
    def __init__(self, path):
        """
        Initialise the author network.

        Parameters
        ----------
        path : str
            Path to the corpus on disk; a raw Scopus corpus reader is
            opened on it immediately.
        """
        self.path = path
        self.corpus = Elsevier_Corpus_Reader.ScopusRawCorpusReader(self.path)
Example #5
0
def document_feature_counter(path,
                             feature='pub_date',
                             sort=False,
                             how='count',
                             **kwargs) -> dict:
    """
    Utility for counting the number of instances observed for a given feature
    in the document meta data.

    Parameters
    ----------
    path: str
        path to the corpus
    feature: str
        feature to be counted
            'pub_date' - date of publication
            'publication' - journal in which the document is published
            'author_count' - number of authors on the document
        (NOTE: 'pub_type' is currently disabled and will raise KeyError.)
    sort: bool
        should the output dictionary be sorted or not
    how: str
        if the output should be sorted, how should it be sorted
        'class' - sorted by the class, requires a sortable class, eg. dates
        'count' - sorted by the number of counts of a class
    kwargs:
        optional arguments piped through to the underlying corpus
        reader method.

    Returns
    -------
        dict like object: a Counter when unsorted, an OrderedDict when sorted

    Raises
    ------
    KeyError
        if `feature` is not a supported feature name, or `sort` is True and
        `how` is neither 'class' nor 'count'.
    """
    corp = Elsevier_Corpus_Reader.ScopusRawCorpusReader(path)
    feature_map = {'pub_date': corp.pub_date,
                   # 'pub_type': corp.pub_type,  # disabled upstream
                   'publication': corp.publication,
                   'author_count': corp.author_count}
    # index into the (class, count) item tuple to sort by
    sort_how_map = {'class': 0,
                    'count': 1}
    # **{} is a no-op, so no need to special-case empty kwargs
    data = Counter(feature_map[feature](**kwargs))
    if not sort:
        return data
    key_index = sort_how_map[how]
    sorted_data = sorted(data.items(), key=lambda kv: kv[key_index])
    return OrderedDict(sorted_data)
def plot_clusters(X, y, **kwargs) -> None:
    """
    Scatter-plot 2-D points coloured by cluster label.

    Parameters
    ----------
    X : array-like, shape (n_samples, 2)
        2-D coordinates of the samples (only columns 0 and 1 are used).
    y : array-like, shape (n_samples,)
        Cluster label per sample, mapped to point hue.
    kwargs:
        extra keyword arguments forwarded to seaborn.scatterplot
        (previously declared but silently ignored).
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    # draw explicitly on the created axes instead of relying on the implicit
    # "current axes" (the original rebound `ax` without ever using it)
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, ax=ax, **kwargs)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    from CorpusReader import Elsevier_Corpus_Reader
    from CorpusProcessingTools import Corpus_Vectorizer
    from CorpusProcessingTools import Corpus_Cluster


    corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")

    loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(corpus, 100, shuffle=False)
    subset = next(loader.fileids(test=True))

    docs = list(corpus.title_tagged(fileids=subset))

    # # Plot hierarchical clustering
    # model = Pipeline([
    #     ("norm", Corpus_Vectorizer.TitleNormalizer()),
    #     ("vect", Corpus_Vectorizer.OneHotVectorizer()),
    #     ('clusters', Corpus_Cluster.HierarchicalClustering())
    # ])
    #
    # clusters = model.fit_transform(docs)
    # labels = model.named_steps['clusters'].labels

    # BUG FIX: the original referenced an undefined name `corp`; the reader
    # above is bound as `corpus`.
    # NOTE(review): Elsivier_Corpus_Pre_Processor is not imported in this
    # block — confirm it is imported at module level.
    formatter = Elsivier_Corpus_Pre_Processor.PickledCorpusPreProcessor(corpus)

    formatter.transform()


def plot_features():
    """Render the co-author network for the 2000-2002 soft-robot categories."""
    network = Author_Networks.AuthorNetworks("Corpus/Processed_corpus/")
    categories = ['soft robot/2000',
                  'soft robot/2001',
                  'soft robot/2002']
    # bokeh-based rendering of the co-author graph
    network.co_author_network_bokeh_better(categories=categories)


if __name__ == '__main__':
    # Corpus build pipeline; earlier steps stay disabled once their
    # output already exists on disk.

    # step 1: download the raw corpus from elsivier
    # download_corpus()

    # step 2: reformat the corpus for faster manipulation
    # reformat_corpus()

    # step 3: reformat the corpus for faster manipulation
    process_corpus()

    # step 4: load the corpus reader
    corpus_path = "Corpus/Processed_corpus/"
    corp = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(corpus_path)

    # step 5: plot author connectivity
    # plot_features()
 def setUp(self) -> None:
     """Open the raw Scopus corpus reader used by these tests."""
     corpus_path = "Corpus/Processed_corpus/"
     self.corp = Elsevier_Corpus_Reader.ScopusRawCorpusReader(corpus_path)
 def setUp(self) -> None:
     """Open the processed corpus reader and an unshuffled 12-fold loader."""
     corpus_path = "Corpus/Processed_corpus/"
     self.corp = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(corpus_path)
     self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(
         self.corp, n_folds=12, shuffle=False)