예제 #1
0
def modify_file_content():
    """Rewrite every Markdown file on the desktop through ``md.modify_text``.

    Each file matched by the hard-coded glob pattern is read completely and
    then overwritten in place: lines consisting of only a newline are
    dropped, and every remaining line is passed to ``md.modify_text`` and
    written back followed by two newline characters.
    """
    pattern = "/Users/Daglas/Desktop/*.md"
    # Windows alternative: "C:\\Users\\dell\\Desktop\\mdFiles\\*.md"

    for matched in glob.glob(pattern):
        # The path is rebuilt from its stem, so the target always ends in ".md".
        stem = os.path.splitext(matched)[0]
        target = stem + ".md"

        # Load everything first: reopening in 'w' mode truncates the file.
        with open(target, encoding='UTF-8') as source:
            original_lines = source.readlines()

        with open(target, 'w', encoding='UTF-8') as sink:
            for text in original_lines:
                if text == '\n':
                    continue  # skip blank lines
                sink.write(md.modify_text(text) + "\n\n")
예제 #2
0
    def changePdfToText(self, filePath):
        """Extract the text of a PDF into a ``.txt`` file next to it.

        Every layout element that exposes ``get_text`` is passed through
        ``md.modify_text`` and appended, followed by two newlines, to a file
        with the same base name as ``filePath`` and a ``.txt`` extension.
        The output is opened in append mode, so repeated calls accumulate
        text in the same file.

        Args:
            filePath: Path of the PDF file to read.

        Raises:
            PDFTextExtractionNotAllowed: If the document reports that it is
                not extractable.
        """
        # Computed once; the output sits beside the PDF.
        txt_path = os.path.splitext(filePath)[0] + '.txt'

        # Keep the PDF open for the whole extraction: the parser is built on
        # this handle and presumably reads from it while pages are processed.
        with open(filePath, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            # An empty password is supplied for documents without one.
            doc = PDFDocument(parser, password='')
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager for shared resources; caching is disabled.
            rsrcmgr = PDFResourceManager(caching=False)
            laparams = LAParams()
            # The aggregator collects each processed page's layout.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # Layout of the page just processed; only elements that
                # carry text (those with ``get_text``) are kept.
                layout = device.get_result()
                texts = [
                    md.modify_text(element.get_text())
                    for element in layout
                    if hasattr(element, "get_text")
                ]
                if not texts:
                    # Do not create or touch the output for text-free pages.
                    continue
                # One open per page instead of one per element; UTF-8 is
                # explicit so non-ASCII text does not depend on the locale.
                with open(txt_path, 'a', encoding='UTF-8') as out:
                    for text in texts:
                        out.write(text + '\n\n')
예제 #3
0
def split_files():
    """Clean each desktop ``.txt`` file, then split it into numbered ``.md`` parts.

    For every file matched by the hard-coded glob pattern:

    1. The file is rewritten in place: lines consisting of only a newline
       are dropped and every other line goes through ``md.modify_text`` and
       is written followed by two newlines.
    2. The number of lines originally read is printed.
    3. The rewritten file is read again and handed, together with a list of
       anchor strings, to ``chunks``; each chunk it yields is written to its
       own ``.md`` file, skipping newline-only lines.

    Output files are named ``<stem><label>.md`` where the label is
    ``index * 100 + 1`` for the zero-based chunk index, prefixed with "0"
    for the first nine chunks only ("01", "0101", ..., "0801", "901",
    "1001", ...).
    """
    pattern = "/Users/Daglas/Desktop/*.txt"
    # Windows alternative: "C:\\Users\\dell\\Desktop\\mdFiles\\*.txt"

    # Every anchor must occur exactly once in the document; search the text
    # by hand to confirm before running.
    anchors = [
        'wisdom about them can be better than individual wisdom',
        'technological progress by establishing a backdrop of unknown',
        'which I argue that in engineering, models are stacked many layers deep',
        'ideas are more durable than the hardware itself',
        'argue that the layers of paradigms for software are so deep',
        'I argue that technology revolutions differ from scientific revolutions',
        'which I examine the concept of information',
        'I explain what software cannot do and show that the number',
        'which I go beyond the countable world of computing and argue that',
        'argue that determinism is a property of models not of the physical world',
        'model of uncertainty about a system and not directly a model of that',
        ' analyze what is holding back technology advancement',
        'These are the main reasons I wrote this book'
    ]

    for matched in glob.glob(pattern):
        stem = os.path.splitext(matched)[0]
        txt_path = stem + ".txt"

        # Pass 1: load the raw text, then rewrite the file with cleaned lines.
        with open(txt_path, encoding='UTF-8') as source:
            raw_lines = source.readlines()

        with open(txt_path, 'w', encoding='UTF-8') as sink:
            for text in raw_lines:
                if text == '\n':
                    continue
                sink.write(md.modify_text(text) + '\n\n')

        print(len(raw_lines))

        # Pass 2: re-read the cleaned text; this is what gets split.
        with open(txt_path, encoding='UTF-8') as source:
            cleaned_lines = source.readlines()

        for index, part in enumerate(chunks(cleaned_lines, anchors)):
            number = index * 100 + 1
            if index < 9:
                label = "0" + str(number)
            else:
                label = str(number)

            with open(stem + label + ".md", 'w', encoding='UTF-8') as sink:
                for text in part:
                    if text != '\n':
                        sink.write(text + '\n')
예제 #4
0
# -*- coding: utf-8 -*-
import glob, os

import modify as md

# Convert every .txt file on the desktop into a cleaned-up .md file that
# shares its base name.
for infile in glob.glob("/Users/Daglas/Desktop/*.txt"):
    filename, ext = os.path.splitext(infile)

    # Read the source text. The encoding is explicit so the result does not
    # depend on the platform's default locale encoding.
    with open(filename + ".txt", encoding='UTF-8') as file_obj:
        lines = file_obj.readlines()

    # Run each non-blank line through md.modify_text and write it to the
    # .md file, followed by two newlines.
    with open(filename + ".md", 'w', encoding='UTF-8') as file_obj:
        for line in lines:
            if line != '\n':
                new_content = md.modify_text(line)
                file_obj.write(new_content + '\n\n')
예제 #5
0
# Turn every .json file on the desktop into a .md file with the same base
# name, listing selected fields of each record ordered by 'collect'.
for infile in glob.glob("/Users/Daglas/Desktop/*.json"):
    filename, ext = os.path.splitext(infile)

    # Load the records. UTF-8 is explicit so decoding does not depend on the
    # platform's default locale encoding.
    with open(filename + ".json", encoding='UTF-8') as file_obj:
        dictls = json.load(fp=file_obj)

    # Collect the fields of every record, highest 'collect' value first.
    # 'collect' itself is used only as the sort key and is not written out.
    views = []
    for reviews in sorted(dictls, key=lambda e: e['collect'], reverse=True):
        for field in ('title', 'author', 'date', 'url', 'readview'):
            views.append(reviews[field])

    # Write each item of each field through md.modify_text, followed by two
    # newlines.
    with open(filename + ".md", 'w', encoding='UTF-8') as file_obj:
        for view in views:
            # NOTE(review): every field value is iterated, so this presumably
            # expects each field to be a list of strings; a plain string
            # would be written one character at a time. Confirm against the
            # JSON schema.
            for line in view:
                if line != '\n':
                    line = md.modify_text(line)
                    file_obj.write(line + "\n\n")