Пример #1
0
def cli():
    """Command-line entry point: score posts and report the results.

    Positional arguments are the blog path (a single file or a directory that
    is searched recursively for files ending in ``-ext``) and the ``.xlsx``
    output path.  With ``--only title`` or ``--only description`` just that
    header field is passed to ``analyze_post``; otherwise the whole file text
    is used.

    For a single input file the resulting table is printed; for a directory
    it is written to the spreadsheet.  Posts whose header ``expiryDate`` is in
    the past are skipped and keep empty cells in the table.

    Raises:
        NotADirectoryError: if the blog path is neither a file nor a directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("blogpath", help="directory of Markdown posts to analyze")
    parser.add_argument("xlsfile", help="path of .xlsx file to write")
    parser.add_argument("--only", help="part of page to analyze", choices=["title", "description"])
    parser.add_argument("-ext", help="filename suffix", default=".md")
    args = parser.parse_args()

    source = Path(args.blogpath).expanduser()
    out_file = Path(args.xlsfile).expanduser()
    out_file.parent.mkdir(parents=True, exist_ok=True)

    single_file = source.is_file()
    if single_file:
        posts = [source]
    elif source.is_dir():
        posts = list(source.rglob(f"*{args.ext}"))
    else:
        raise NotADirectoryError(source)

    # Keys read from the analyze_post() result, in column order.
    score_keys = ["pos", "neu", "neg", "compound"]
    columns = list(score_keys)
    if args.only:
        # Keep the analyzed header text next to its scores.
        columns.append(args.only)

    table = pandas.DataFrame(index=[post.stem for post in posts], columns=columns)

    started = datetime.datetime.now()
    total = len(posts)

    for count, post in enumerate(posts, start=1):
        # Carriage return keeps the progress counter on a single console line.
        print(f"{count} / {total} {post.stem:<80}", end="\r")

        header = hugoutils.get_header(post)[0]
        if header is not None and "expiryDate" in header:
            # Only the leading YYYY-MM-DD part of the expiry value is compared.
            expiry = datetime.datetime.strptime(header["expiryDate"][:10], "%Y-%m-%d")
            if expiry < started:
                print("skip", post)
                continue

        if args.only:
            try:
                text = header[args.only]
            except TypeError:
                # header is not subscriptable (e.g. None): nothing to analyze.
                continue
            except KeyError:
                logging.error(f"{post.stem} does not have {args.only}")
                continue
        else:
            text = post.read_text(errors="ignore")

        scores = analyze_post(text)

        row = [scores[key] for key in score_keys]
        if args.only:
            row.append(text)
        table.loc[post.stem] = row

    if single_file:
        print(table)
    else:
        table.to_excel(out_file)
Пример #2
0
def get_tags(path: Path, taxonomy_type: str) -> set[str]:
    """Collect the distinct values of one header key across Markdown files.

    Every ``*.md`` file directly inside ``path`` (not recursive) is passed to
    ``hugoutils.get_header``; the first element of the result is treated as
    the header mapping and indexed with ``taxonomy_type``.

    Args:
        path: directory containing the Markdown files.
        taxonomy_type: header key to read, e.g. ``"tags"``.

    Returns:
        The set of all values found.  Files without a usable header, without
        the key, or with an empty value for the key contribute nothing.
    """
    dat: set[str] = set()

    for f in path.glob("*.md"):
        header = hugoutils.get_header(f)[0]
        try:
            tags = header[taxonomy_type]
        except (TypeError, KeyError):
            # TypeError: header is not subscriptable (e.g. None);
            # KeyError: this file does not define the key.
            continue

        if not tags:
            # Key present but empty (None, "" or []): iterating None would raise.
            continue

        if isinstance(tags, str):
            # A single scalar value is one tag, not a sequence of characters.
            dat.add(tags)
        else:
            dat.update(tags)

    return dat
Пример #3
0
# Count how many posts use each tag and write the totals to a spreadsheet.
# NOTE(review): `p` is assumed to be an argparse.ArgumentParser created above
# this excerpt -- confirm against the full script.
p.add_argument("path", help="path to read Markdown blog files")
p.add_argument("xlsx", help="excel filename to write")
p.add_argument("-ext", help="filename suffix", default=".md")
p = p.parse_args()  # from here on `p` is the parsed namespace, not the parser

inpath = Path(p.path).expanduser()
if not inpath.is_dir():
    raise NotADirectoryError(inpath)

xlsx = Path(p.xlsx).expanduser()

files = list(inpath.rglob(f"*{p.ext}"))
dat: dict[str, int] = {}

for f in files:
    header = hugoutils.get_header(f)[0]
    try:
        tags = header["tags"]
    except (TypeError, KeyError):
        # Header not subscriptable (e.g. None) or no "tags" key: nothing to count.
        continue
    except Exception as e:
        logging.error(f"{e}: {f.stem}")
        # Skip this file; otherwise `tags` would be unbound or left over from
        # the previous file and counted a second time.
        continue

    if not tags:
        # "tags" present but empty (None, "" or []): iterating None would raise.
        continue
    if isinstance(tags, str):
        # A single scalar value is one tag, not a sequence of characters.
        tags = [tags]

    for tag in tags:
        dat[tag] = dat.get(tag, 0) + 1

pandas.DataFrame(index=list(dat), data=list(dat.values()),
                 columns=["count"]).to_excel(xlsx)