Exemplo n.º 1
0
def empty_post():
    """Build a mock post with empty content and empty metadata containers."""
    post = utils.create_objdict()
    post.md = ""
    post.html = ""
    meta = utils.create_objdict()
    meta.statistics = utils.create_objdict()
    meta.toc = utils.create_objdict()
    post.meta = meta
    post.elements = utils.create_objdict()
    return post
Exemplo n.º 2
0
def compute_stats(doc):
    """Compute count and readability statistics for a spaCy doc.

    Args:
        doc: a spaCy document wrapped by textacy's TextStats.

    Returns:
        objdict: with a `counts` and a `readability` sub-objdict.
    """
    ts = TextStats(doc)
    stats = create_objdict()
    stats.counts = dict_to_objdict({
        'sentences': ts.n_sents,
        'words': ts.n_words,
        'unique_words': ts.n_unique_words,
        'chars': ts.n_chars,
        'chars_per_word': ts.n_chars_per_word,
        'long_words': ts.n_long_words,
        'syllables': ts.n_syllables,
        'syllables_per_word': ts.n_syllables_per_word,
        'monosyllable_words': ts.n_monosyllable_words,
        'polysyllable_words': ts.n_polysyllable_words,
    })

    # Readability scores are only meaningful for non-empty text.
    readability = {}
    if stats.counts.words > 0:
        readability = {
            'flesch_kincaid_grade_level': ts.flesch_kincaid_grade_level,
            'flesch_reading_ease': ts.flesch_reading_ease,
            'smog_index': 0,
            'gunning_fog_index': ts.gunning_fog_index,
            'coleman_liau_index': ts.coleman_liau_index,
            'automated_readability_index': ts.automated_readability_index,
            'lix': ts.lix,
        }
    # SMOG is only reliable with at least 30 sentences.
    if stats.counts.sentences >= 30:
        readability['smog_index'] = ts.smog_index
    stats.readability = dict_to_objdict(readability)
    return stats
Exemplo n.º 3
0
    def process(self, unused, site, config):
        """Compute and attach related posts to each post via LSI similarity.

        Args:
            unused: unused positional argument (plugin API signature).
            site (SiteFab): site object holding the posts to relate.
            config (objdict): plugin configuration; reads num_related_posts.

        Return:
            tuple: (status, plugin name, html log details)
        """
        try:
            num_related_posts = config.num_related_posts

            # Tokenize — exclude pages that are not posts.
            docs = []
            valid_posts = []
            for post in site.posts:
                if post.meta.microdata_type not in RelatedPosts.VALID_FORMAT:
                    continue
                docs.append(
                    gensim.utils.simple_preprocess(post.md,
                                                   deacc=True,
                                                   min_len=3,
                                                   max_len=15))
                valid_posts.append(post)
                # Fixme stemming

            # Build an LSI model on top of a tf-idf weighted corpus.
            dictionary = corpora.Dictionary(docs)
            corpus = [dictionary.doc2bow(doc) for doc in docs]
            tfidf = models.tfidfmodel.TfidfModel(corpus=corpus)

            # Fixme: get correct number of topics.
            # Use the number of posts as a proxy for the number of topics.
            # Bugfix: "/" yields a float under Python 3 while gensim's
            # num_topics must be an int; also guarantee at least 1 topic.
            num_topics = max(1, len(site.posts) // 5)
            topic_model = models.LsiModel(tfidf[corpus],
                                          id2word=dictionary,
                                          num_topics=num_topics)
            # +1 because the best match for a post is the post itself.
            index = similarities.MatrixSimilarity(
                topic_model[tfidf[corpus]], num_best=num_related_posts + 1)

            # Find similar posts and store them in each post's metadata.
            # valid_posts is already filtered, so no re-check is needed here.
            log_details = ""
            for post, sims in zip(valid_posts, index):
                post.meta.related_posts = []
                log_details += '<div class="subsection"><h3>%s</h3>Related posts:<ol>' % (
                    post.meta.title)
                # sims[1:] — skip the first hit: it is the article itself.
                for idx, score in sims[1:]:
                    p = valid_posts[idx]
                    o = utils.create_objdict()
                    o.meta = p.meta
                    o.score = score
                    # Bugfix: was "o.html = p.score" — store the related
                    # post's html, not a second copy of the score.
                    o.html = p.html
                    post.meta.related_posts.append(o)
                    log_details += '<li>%s (%s)</li>' % (o.meta.title,
                                                         round(score, 2))
                # Bugfix: closing tag was '<ol>' instead of '</ol>'.
                log_details += '</ol></div>'
            return (SiteFab.OK, "Related posts via LSI", log_details)
        except Exception as e:
            # Boundary handler: report the failure through the plugin API
            # rather than crashing the whole build.
            return (SiteFab.ERROR, "Related posts via LSI", e)
Exemplo n.º 4
0
def test_objdict_to_dict():
    "Converting an objdict to a plain dict preserves scalars and arrays."
    source = utils.create_objdict()
    source.str = "str"
    source.int = 1
    source.array = [1, 2, 3]
    source.arrayofarray = [[1, 2], [3, 4]]

    converted = utils.objdict_to_dict(source)

    assert converted['str'] == "str"
    assert converted['int'] == 1
    assert converted['array'][2] == 3
    # nested arrays must survive the conversion
    assert converted['arrayofarray'][0][0] == 1
    assert converted['arrayofarray'][1][1] == 4
Exemplo n.º 5
0
def analyze_post(post, debug=False):
    """Perform NLP analysis on a post.

    Args:
        post (objdict): post with meta fields and raw text.
        debug (bool): when True, print per-stage performance counters.

    Returns:
        objdict: clean_fields, terms, title_terms and stats.
    """
    counters = PerfCounters()
    nlp = create_objdict()

    # Clean fields.
    counters.start('cleanup')
    cleaned = generate_clean_fields(post)
    nlp.clean_fields = cleaned
    counters.stop('cleanup')

    # Create the spaCy docs.
    counters.start('make_spacy_docs')
    full_content = ' '.join([cleaned.title, cleaned.category,
                             " ".join(cleaned.tags),
                             cleaned.abstract, cleaned.text])

    # doc used for overall term extraction
    cleaned_doc = make_spacy_doc(full_content, lang=SPACY_MODEL)
    # doc used for title term extraction
    title_doc = make_spacy_doc(cleaned.title, lang=SPACY_MODEL)
    # doc used for statistics
    text_doc = make_spacy_doc(post.text, lang=SPACY_MODEL)
    counters.stop('make_spacy_docs')

    # Terms extraction.
    counters.start('extract_key_terms')
    nlp.terms = extract_key_terms(cleaned_doc, num_terms=NUM_TERMS,
                                  algo=TERM_EXTRACTOR_ALGO, ngrams=NGRAMS)
    # !note we restrict ngram to one as we only want the lemmized top terms.
    nlp.title_terms = extract_key_terms(title_doc, num_terms=NUM_TERMS,
                                        algo=TERM_EXTRACTOR_ALGO, ngrams=1)
    counters.stop('extract_key_terms')

    # Text stats.
    counters.start('text_stats')
    nlp.stats = compute_stats(text_doc)
    counters.stop('text_stats')

    if debug:
        counters.report()
    return nlp
Exemplo n.º 6
0
def generate_clean_fields(post):
    """Generate a cleaned up version of the post and its metadata.

    Args:
        post (objdict): the post whose metadata and text are cleaned.

    Returns:
        objdict: cleaned title, abstract, authors, conference info,
        category, tags and text. String fields default to "", list
        fields to [], when the source value is missing.
    """
    def _clean_text(value):
        "Cleanup an optional string field; empty string when absent."
        return text_cleanup(value) if value else ""

    def _clean_items(values):
        "Cleanup each element of an optional list field."
        return [text_cleanup(v) for v in values] if values else []

    clean_fields = create_objdict()

    # cleaned up fields
    clean_fields.title = _clean_text(post.meta.title)
    clean_fields.abstract = _clean_text(post.meta.abstract)
    clean_fields.authors = _clean_items(post.meta.authors)

    # conference
    # Bugfix: the default was [] while the cleaned value is a string —
    # normalized to "" so the field always has a consistent type.
    clean_fields.conference_name = _clean_text(post.meta.conference_name)
    clean_fields.conference_short_name = _clean_text(
        post.meta.conference_short_name)

    # category, tags, etc
    clean_fields.category = _clean_text(post.meta.category)
    clean_fields.tags = _clean_items(post.meta.tags)

    # text
    # !make sure to use post html and clean it to avoid markup keywords.
    clean_fields.text = _clean_text(post.text)

    return clean_fields
Exemplo n.º 7
0
    def lint(self, post, rendered_post, site):
        """Lint a post and tally its errors and warnings.

        Args:
            post (Post): the post to analyze
            rendered_post (str): the html version of the post
            site (Sitefab): the site object mainly used to get access
            to plugin data
        Return:
            dict: linting results
        """
        results = utils.create_objdict()
        results.has_errors = 0
        results.has_warnings = 0

        # frontmatter
        results.info = frontmatter.lint(post, self.test_info, self.config)

        # images — image_info plugin data is optional.
        image_info = site.plugin_data.get('image_info')
        img_results = images.lint(post, self.test_info, self.config,
                                  image_info)
        results.info.extend(img_results)

        # structure
        structure_results = structure.lint(post, self.test_info, self.config)
        results.info.extend(structure_results)

        # Each entry's first element is its code (e.g. "E123" / "W42").
        # Bugfix: warnings were detected via d[0][1] == "W" — the second
        # character of the code — instead of the first character.
        for d in results.info:
            if d[0][0] == "E":
                results.has_errors += 1
            elif d[0][0] == "W":
                results.has_warnings += 1

        # Only remember posts that actually have findings.
        if results.has_errors or results.has_warnings:
            self.results[post.filename] = results

        return results
Exemplo n.º 8
0
    def __init__(self, config_filename, version='1.0'):
        """Initialize the site: load config, plugins, templates, logger,
        linter, and collect the content file list.

        Args:
            config_filename (str): path to the site yaml configuration file.
            version (str): sitefab version string exposed to templates.

        Raises:
            Exception: if config_filename is empty.
        """
        # Timers
        self.cnts = PerfCounters()
        self.cnts.start('Overall')
        self.cnts.start('Init')

        # [configuration]
        self.current_dir = Path.cwd()

        # make the config file path absolute to avoid weird cases
        # NOTE(review): the empty-filename check below runs AFTER
        # Path(config_filename).resolve(), which already fails on None —
        # confirm whether the check should come first.
        self.config_filename = Path(config_filename).resolve()
        if not config_filename:
            raise Exception("Supply a configuration filename")

        # exist?
        if self.config_filename.is_file():  # absolute path
            self.config = files.load_config(self.config_filename)
        else:
            # NOTE(review): presumably utils.error() aborts the program;
            # otherwise self.config would be left unset here — confirm.
            utils.error("Config file %s not found" % self.config_filename)

        # site root dir is -1 from where the config is
        self.config.root_dir = self.config_filename.parents[1]
        self.config.build = utils.create_objdict()

        # expose sitefab version to the templates
        self.config.build.sitefab_version = version

        # [parser] #

        # initialize the parser config
        parser_tpl_path = Path(self.config.parser.template_dir)
        self.config.parser.templates_path = (self.config.root_dir /
                                             parser_tpl_path)

        self.config.parser = Parser.make_config(self.config.parser)

        # [plugins]

        # loading configuration
        # NOTE(review): plugins_config is re-created on every iteration of
        # this loop, so only the last config_dir's files survive — confirm
        # whether it should be initialized before the loop.
        for d in self.config.plugins.config_dir:
            config_dir = self.config.root_dir / d

            # load the various config files from disk
            plugins_config = defaultdict(dict)
            for config_fname in files.get_files_list(config_dir, '*.yaml'):
                plugin_name = config_fname.stem
                # category is derived from the two parent directory names,
                # e.g. plugins/site/related_posts -> "PluginsSite"
                category = "%s%s" % (str(config_fname.parts[-3]).capitalize(),
                                     str(config_fname.parts[-2]).capitalize())
                config = files.load_config(config_fname)
                plugins_config[category][plugin_name] = config

        # where to redirect the standard python log
        debug_log_fname = self.get_logs_dir() / "debug.log"
        self.plugins = Plugins(self.get_plugins_dirs(), debug_log_fname,
                               plugins_config)

        # Store data generated by plugins that can be used later.
        self.plugin_data = {}
        self.plugin_results = defaultdict(int)

        # [template rendering engine] #
        self.jinja2 = Environment(loader=FileSystemLoader(
            str(self.get_template_dir())),
                                  extensions=['jinja2.ext.do'])

        # loading templates custom functions
        custom_filters = self.plugins.get_template_filters()
        for flt_name, flt_fct in custom_filters.items():
            self.jinja2.filters[flt_name] = flt_fct

        # [logger] #
        cfg = utils.create_objdict()
        cfg.output_dir = self.get_logs_dir()
        # log template not the one from the users.
        cfg.template_dir = (self.config.root_dir /
                            self.config.logger.template_dir)

        tpl_dir = self.config.root_dir / Path(self.config.logger.template_dir)
        self.config.logger.template_dir = tpl_dir
        cfg.log_template = "log.html"
        cfg.log_index_template = "log_index.html"  # noqa
        cfg.stats_template = "stats.html"
        self.logger = Logger(cfg, self)

        # [linter] #
        linter_config_filename = (self.config.root_dir /
                                  self.config.linter.configuration_file)
        linter_config = files.load_config(linter_config_filename)

        linter_config.report_template_file = (
            self.config.root_dir / self.config.linter.report_template_file)

        linter_config.output_dir = self.get_logs_dir()
        linter_config.site_output_dir = self.get_output_dir()
        self.linter = Linter(linter_config)

        # Finding content and assets.
        self.filenames = utils.create_objdict()
        self.filenames.posts = files.get_files_list(self.get_content_dir(),
                                                    "*.md")

        # Cleanup the output directories.
        files.clean_dir(self.get_output_dir())
        self.cnts.stop('Init')
Exemplo n.º 9
0
def test_create_objdict():
    "create_objdict returns a genuine objdict instance."
    created = utils.create_objdict()
    reference = objdict()
    assert type(created) == type(reference)