Example #1
def get_question_feed(url, force_reload=False):
    """Retrieve the last questions of the feed

    Returns a structure with the following format:
      [Question_1, Question_2, ...]

    where Question_n has the following keys:
      link: str
      title: str
      body: str (html)
      tags: list of str
    """

    log(bold("Fetching question feed"))
    if force_reload:
        log(fg("Force reload", magenta))
    feed = spider.get_feed(url, force_reload=force_reload)
    if feed.status == 304:  # Not Modified
        log(fg("Feed not modified since last retrieval (status 304)", magenta))
        return []
    log("Number of entries in feed: {}", fg(len(feed.entries), green))
    questions = []
    for entry in feed.entries:
        soup = BeautifulSoup(entry.summary, "html.parser")
        q = {
            "link": entry.link,
            "title": entry.title,
            "body": soup.getText(" ", strip=True),
            "tags": [x["term"] for x in entry.tags],
        }
        questions.append(q)
    return questions
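A quick usage sketch, not part of the original project: assuming this function is exposed by the fetcher module (as Example #10 suggests) and pointed at the Stack Overflow feed URL used there, the returned structure can be consumed directly:

from fetcher import get_question_feed

# Illustrative call; the feed URL is taken from Example #10.
questions = get_question_feed("https://stackoverflow.com/feeds/")
for q in questions:
    print(q["title"], "|", ", ".join(q["tags"]))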
Example #2
def warn(msg, *argv):
    """Print an error message and aborts execution"""
    err_off = sys.stderr not in _logs
    if err_off:
        add_stderr()
    log(fg(msg, magenta), *argv, who=_get_caller())
    if err_off:
        _logs.pop()  # remove stderr only if this call added it
Example #3
def abort(msg, *argv):
    """Print an error message and aborts execution"""

    if sys.stderr not in _logs:
        add_stderr()
    log(fg(msg, red), *argv, who=_get_caller())
    print_advice()
    close_logs()
    sys.exit(1)
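A sketch contrasting the two helpers from Examples #2 and #3, assuming both live in the log module used in Example #10: warn temporarily adds stderr and returns to the caller, while abort also prints advice, closes every log, and terminates the process:

from log import warn, abort

warn("Cache file {} looks stale", "users-1234.json")  # execution continues
abort("Too many requests")  # prints advice, closes logs, never returns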
Example #4
def get(url, delay=2, use_cache=True, max_delta=td(hours=12)):
    """Respectful wrapper around requests.get"""

    useragent = "Answerable v0.1"

    # If a cached answer exists and is acceptable, then return the cached one.

    cache_file = url.replace("/", "-")
    if use_cache:
        log("Checking cache before petition {}", fg(url, yellow))
        hit, path = cache.check("spider", cache_file, max_delta)
        if hit:
            with open(path, "r") as fh:
                res = fh.read().replace("\r\n", "")
            return _FalseResponse(200, res)

    # If the robots.txt doesn't allow the scraping, return forbidden status
    if not ask_robots(url, useragent):
        log(fg("robots.txt forbids {}", red), url)
        return _FalseResponse(403, "robots.txt forbids it")

    # Make the request after the specified delay
    # log("[{}] {}".format(fg("{:4.2f}".format(delay), yellow), url))
    log("Waiting to ask for {}", fg(url, yellow))
    log("  in {:4.2f} seconds", delay)
    sleep(delay)
    headers = {"User-Agent": useragent}
    log("Requesting")
    res = requests.get(url, timeout=10, headers=headers)
    # Exit the program if the scraping was penalized
    if res.status_code == 429:  # too many requests
        abort("Too many requests")

    # Cache the response if allowed by user
    if use_cache:
        cache.update("spider",
                     cache_file,
                     res.content.decode(res.encoding or "utf-8"),
                     json_format=False)

    return res
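A minimal sketch of a polite fetch, assuming this wrapper is exported by the spider module; the delay and cache window are illustrative values, and a 403 here means robots.txt forbids the URL rather than an actual server response:

from datetime import timedelta as td
from spider import get

# Wait 5 seconds before the request; reuse cached copies younger than 1 hour.
res = get("https://stackoverflow.com/questions", delay=5, max_delta=td(hours=1))
print(res.status_code)  # 200 on success, 403 if robots.txt forbids the URL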
Example #5
def update(category: str, _file: str, obj, json_format=True):
    """Update or create a file in the cache

    Parameters:
    category: Folder inside the cache.
    _file: File name to store in.
    obj: Serializable object to store.
    json_format: If true, serialize obj as JSON; otherwise write it verbatim.

    Returns:
    (B, P) where B is true if the write succeeded and P is the file path.
    """

    subpath = pathlib.Path(category) / _file
    path = pathlib.Path.cwd() / __cache_dir / subpath
    path.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(path, "w") as fh:
            if json_format:
                json.dump(obj, fh, indent=2)
            else:
                fh.write(obj)
        log("  Cache updated: {}", fg(subpath, green))
    except OSError as err:
        log("  {}: {}", err, fg(subpath, magenta))
        return False, path
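A sketch of both storage modes, assuming the module is imported as cache, as in Examples #4 and #7; the category and file names are illustrative:

import cache

# JSON mode (default): obj must be serializable.
cache.update("users", "1234.json", {"user_id": 1234, "tags": ["python"]})
# Raw mode: obj is written verbatim as text.
cache.update("spider", "stackoverflow.com-questions", "<html>...</html>", json_format=False)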
Example #6
def check(category: str, _file: str, max_delta: td) -> tuple[bool, pathlib.Path]:
    """Return if a file is cached and where it is located.

    Returns:
    (B, P) where
    - B is true if the content is cached and usable
    - P is the path where the cached content is/should be.

    Parameters:
    category: Folder inside the cache.
    _file: File name to look for.
    max_delta: Timedelta used as threshold to consider a file too old.
    """

    # Prepare the path to the cached file
    subpath = pathlib.Path(category) / _file
    path = pathlib.Path.cwd() / __cache_dir / subpath
    path.parent.mkdir(parents=True, exist_ok=True)

    try:
        if not path.exists():
            log("  Miss {}", fg(subpath, magenta))
            return False, path
        else:
            # Check if the file is too old
            log("  Hit {}", fg(subpath, green))
            modified = dt.fromtimestamp(path.stat().st_mtime)
            now = dt.now()
            delta = now - modified
            log("  Time passed since last fetch: {}", delta)
            valid = delta < max_delta
            if valid:
                log(fg("  Recent enough", green))
            else:
                log(fg("  Too old", magenta))
            return valid, path
    except OSError as err:
        log("  {}: {}", err, fg(subpath, magenta))
        return False, path
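check and update together form the read-through caching idiom used in Examples #4 and #7. A sketch, where rebuild_user_data is a hypothetical expensive step standing in for a real fetch:

from datetime import timedelta as td
import json
import cache

hit, path = cache.check("users", "1234.json", td(hours=12))
if hit:
    with open(path) as fh:
        data = json.load(fh)
else:
    data = rebuild_user_data()  # hypothetical expensive step
    cache.update("users", "1234.json", data)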
Example #7
def get_QA(user_id, force_reload=False, max_page=5):
    """Retrieve information about the questions answered by the user

    Return
        [
            (Question_1, Answer_1),
            (Question_2, Answer_2),
            ...
        ]
    See
        get_questions, get_user_answers
    """

    log(bold("Fetching user information"))
    if force_reload:
        log(fg("Force reload", magenta))
    cache_file = str(user_id) + ".json"
    # Check cache
    if not force_reload:
        hit, fpath = cache.check(cache_where, cache_file, cache_threshold)
        if hit:
            with open(fpath) as fh:
                stored = json.load(fh)
            return stored
    # Get the answers
    answers = get_user_answers(user_id, force_reload, max_page)

    # Get the questions
    q_ids = [str(a["question_id"]) for a in answers]
    questions = get_questions(q_ids)

    # Join answers and questions
    user_qa = [(q, a) for q in questions for a in answers
               if q["question_id"] == a["question_id"]]
    for q, a in user_qa:
        a["tags"] = q["tags"]  # copy tags before caching so stored entries keep them
    cache.update(cache_where, cache_file, user_qa)

    # Include questions specified by the user
    try:
        with open("include.txt", "r") as f:
            extra_q_ids = f.read().split()
        log("Aditional training: " + str(extra_q_ids))
        extra_questions = get_questions(extra_q_ids)
    except FileNotFoundError:
        extra_questions = []
        log("No additional training specified by user")
    user_qa += [(q, None) for q in extra_questions]

    return user_qa
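A usage sketch, assuming the function is exposed by the fetcher module and that question objects carry a title field, as the feed entries do; 1234567 is a placeholder user id:

from fetcher import get_QA

for question, answer in get_QA(1234567):
    # Entries coming from include.txt have no answer attached.
    mark = "answered" if answer is not None else "manually included"
    print(question["title"], "-", mark)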
Example #8
def get_feed(url, force_reload=False):
    """Get RSS feed and optionally remember to reduce bandwith"""

    useragent = "Answerable RSS v0.1"
    log("Requesting feed {}", fg(url, yellow))
    cache_file = url.replace("/", "_")

    # Get the conditional headers for the GET bandwidth reduction
    etag = None
    modified = None
    if not force_reload:
        hit, path = cache.check("spider.rss", cache_file, td(days=999))
        if hit:
            with open(path, "r") as fh:
                headers = json.load(fh)
                etag = headers["etag"]
                modified = headers["modified"]
        log("with {}: {}", bold("etag"), fg(etag, yellow))
        log("with {}: {}", bold("modified"), fg(modified, yellow))

    # Get the feed
    feed = feedparser.parse(url, agent=useragent, etag=etag, modified=modified)

    # Store the etag and/or modified headers
    if feed.status != 304:
        etag = feed.etag if "etag" in feed else None
        modified = feed.modified if "modified" in feed else None
        new_headers = {
            "etag": etag,
            "modified": modified,
        }
        cache.update("spider.rss", cache_file, new_headers)
        log("Stored new {}: {}", bold("etag"), fg(etag, green))
        log("Stored new {}: {}", bold("modified"), fg(modified, green))

    return feed
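A sketch of the conditional-GET behavior, assuming the function lives in the spider module as in Example #1:

from spider import get_feed

feed = get_feed("https://stackoverflow.com/feeds/")
if feed.status != 304:
    print(len(feed.entries), "new entries")
# A second call reuses the stored etag/modified pair, so an unchanged feed
# comes back as a 304 with nothing to re-parse.
feed = get_feed("https://stackoverflow.com/feeds/")
print(feed.status)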
Example #9
    def cf(x):
        """Color a count green if it is zero, magenta otherwise"""

        color = displayer.green if x == 0 else displayer.magenta
        return displayer.fg(x, color)
Example #10
def recommend(args):
    """Recommend questions from the latest unanswered"""

    filtered = {"hidden": 0, "closed": 0, "duplicate": 0}

    def valid_entry(entry):
        """Check if a entry should be taken into account"""

        if set(entry["tags"]) & hide_tags:
            filtered["hidden"] += 1
            return False
        if entry["title"].endswith("[closed]"):
            filtered["closed"] += 1
            return False
        if entry["title"].endswith("[duplicate]"):
            filtered["duplicate"] += 1
            return False
        return True

    def cf(x):
        """Color a count green if it is zero, magenta otherwise"""

        color = displayer.green if x == 0 else displayer.magenta
        return displayer.fg(x, color)

    # Load configuration
    config = load_config(args)

    # Load the model
    try:
        model_name = config["model"]
        log.log("Loading model {}", displayer.fg(model_name, displayer.yellow))
        model = importlib.import_module(f".{model_name}", "models")
        log.log("Model {} succesfully loaded",
                displayer.fg(model_name, displayer.green))
    except ModuleNotFoundError as err:
        if err.name == f"models.{model_name}":
            log.abort("Model {} not present", model_name)
        else:
            log.abort("Model {} unsatisfied dependency: {}", model_name,
                      err.name)

    # Get user info and feed
    user_qa = fetcher.get_QA(config["user"], force_reload=args.f)
    if args.all or "tags" not in config:
        tags = ""
    else:
        tags = "tag?tagnames="
        tags += "%20or%20".join(config["tags"]["followed"]).replace("+", "%2b")
        tags += "&sort=newest"
    url = "https://stackoverflow.com/feeds/" + tags
    try:
        feed = fetcher.get_question_feed(url, force_reload=args.F)
        if len(feed) == 0:
            raise ValueError("No feed returned")
        # Filter feed from ignored tags
        if args.all or "tags" not in config:
            hide_tags = set()
        else:
            hide_tags = set(config["tags"]["ignored"])
        useful_feed = [e for e in feed if valid_entry(e)]
        if len(useful_feed) == 0:
            raise ValueError("All feed filtered out")
        log.log(
            "Discarded: {} ignored | {} closed | {} duplicate",
            cf(filtered["hidden"]),
            cf(filtered["closed"]),
            cf(filtered["duplicate"]),
        )

        # Make the recommendation
        log.log(f"Corpus size: {len(user_qa)} Feed size: {len(useful_feed)}")
        rec_index, info = model.recommend(user_qa, useful_feed)
        selection = [useful_feed[i] for i in rec_index[:args.limit]]
        if args.info and info is None:
            log.warn("Info requested, but model {} returns None", model_name)
        elif args.info and info is not None:
            info = [info[i] for i in rec_index[:args.limit]]
        displayer.disp_feed(selection, info, args.info)
    except ValueError as err:
        log.warn(err)
        log.print_advice()
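recommend expects an argparse-style namespace with the attributes used above (f, F, all, limit, info). A hypothetical reconstruction of that wiring; the real project's CLI may differ:

import argparse

parser = argparse.ArgumentParser(prog="answerable")
parser.add_argument("-f", action="store_true", help="force-reload user answers")
parser.add_argument("-F", action="store_true", help="force-reload the question feed")
parser.add_argument("--all", action="store_true", help="ignore followed/ignored tags")
parser.add_argument("--limit", type=int, default=10, help="max questions to show")
parser.add_argument("--info", action="store_true", help="display model info")
recommend(parser.parse_args())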