Пример #1
0
def extract_to_text(i):
    """Return the filtered text of paper ``i``, converting its PDF on first use.

    If the ``.txt`` file for ``i`` does not exist yet, the paper is obtained
    from ``get_past_paper(i)``, written to a temporary ``.pdf`` file and
    converted with ``pdf2txt``; the temporary PDF is always removed
    afterwards. The text file is then read, passed through
    ``filter_mock_paper_text`` and overwritten with the filtered result.

    Args:
        i: Identifier forwarded to ``get_path`` and ``get_past_paper``.

    Returns:
        The filtered text, identical to what is written back to the
        ``.txt`` file.
    """
    path = get_path()
    pdf_file_path = get_path(i, ".pdf")
    txt_file_path = get_path(i, ".txt")
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(path, exist_ok=True)

    if not os.path.exists(txt_file_path):
        # NOTE(review): get_past_paper is assumed to return bytes, since the
        # result is written to a file opened in binary mode -- confirm.
        mock_test_file = get_past_paper(i)
        with open(pdf_file_path, "wb") as f:
            f.write(mock_test_file)

        try:
            # Pass the arguments as a list instead of formatting a command
            # string and splitting it on spaces, which broke any path that
            # itself contained a space.
            pdf2txt.main(['-o' + txt_file_path, pdf_file_path])
        finally:
            # Remove the temporary PDF even when the conversion fails.
            os.remove(pdf_file_path)

    with open(txt_file_path, "r", encoding="utf-8") as f:
        clean_mock_paper_text = filter_mock_paper_text(f.read())

    # Binary mode keeps the written bytes exactly as encoded (no newline
    # translation), matching how the file was rewritten before.
    with open(txt_file_path, "wb") as f:
        f.write(clean_mock_paper_text.encode("utf-8"))

    return clean_mock_paper_text
Пример #2
0
def run(datapath, filename, options=None):
    """Run pdf2txt on a PDF and write the result to a ``.txt`` file.

    The input path is ``path + datapath + filename + '.pdf'`` and the output
    path is ``path + filename + '.txt'``, where ``path`` is a module-level
    name defined outside this function.

    Args:
        datapath: String inserted between ``path`` and ``filename`` to form
            the input PDF path.
        filename: Base name (without extension) used for both the input and
            the output file.
        options: Optional string of extra pdf2txt command-line options,
            separated by whitespace.
    """
    input_pdf = path + datapath + filename + '.pdf'
    output_txt = path + filename + '.txt'

    # Build the argument list directly. Formatting one command string and
    # splitting it on spaces would also split any path containing a space.
    args = ['-o' + output_txt]
    if options:
        # Only the options string is meant to hold several tokens; split()
        # without an argument also ignores repeated whitespace.
        args.extend(options.split())
    args.append(input_pdf)
    pdf2txt.main(args)
Пример #3
0
def run(datapath, filename, options=None):
    """Convert a PDF to text by invoking ``pdf2txt.main``.

    Reads ``path + datapath + filename + '.pdf'`` and writes
    ``path + filename + '.txt'``; ``path`` is a module-level name defined
    outside this function.

    Args:
        datapath: String placed between ``path`` and ``filename`` when
            building the input PDF path.
        filename: Extension-less base name shared by the input and output
            files.
        options: Optional whitespace-separated string of additional pdf2txt
            command-line options.
    """
    source_pdf = path + datapath + filename + '.pdf'
    target_txt = path + filename + '.txt'

    # The arguments are assembled as a list so that spaces inside either
    # path are preserved; only the options string is tokenized.
    extra_args = options.split() if options else []
    pdf2txt.main(['-o' + target_txt] + extra_args + [source_pdf])
Пример #4
0
def run(sample_path, options=None):
    """Run pdf2txt on a sample file, directing output to a temporary file.

    The sample path is resolved with ``absolute_sample_path``. The output
    goes to a ``NamedTemporaryFile`` that is discarded when the function
    returns, so the call only checks that pdf2txt runs without raising.

    Args:
        sample_path: Path of the sample, as accepted by
            ``absolute_sample_path``.
        options: Optional whitespace-separated string of extra pdf2txt
            command-line options.
    """
    absolute_path = absolute_sample_path(sample_path)
    with NamedTemporaryFile() as output_file:
        # Build the argument list directly rather than formatting a command
        # string and splitting it on spaces: the temp file name or the sample
        # path may contain spaces and must stay a single argument.
        args = ['-o{}'.format(output_file.name)]
        if options:
            args.extend(options.split())
        args.append(absolute_path)
        pdf2txt.main(args)
Пример #5
0
 def extract_images(self, input_file):
     """Run pdf2txt with ``--output-dir`` and list the files it produced.

     A temporary directory is passed to pdf2txt as ``--output-dir`` and the
     text output goes to a throwaway temporary file. The directory is
     removed before returning, so the result contains file names only; the
     files themselves no longer exist.

     Args:
         input_file: Path of the input file handed to pdf2txt.

     Returns:
         The names returned by ``os.listdir`` for the output directory.
     """
     output_dir = mkdtemp()
     try:
         with NamedTemporaryFile() as output_file:
             commands = [
                 '-o', output_file.name, '--output-dir', output_dir, input_file
             ]
             pdf2txt.main(commands)
         return os.listdir(output_dir)
     finally:
         # Always clean up the temporary directory, including when pdf2txt
         # or the directory listing raises.
         rmtree(output_dir)
Пример #6
0
    )
    mock_test_response = session.get(pdf_mock_test_url)  # stream=True
    mock_test_response.raise_for_status()
    mock_test_file = io.BytesIO(mock_test_response.content)
    # The pdfReader variable is a readable object that will be parsed
    path = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'
    file_name = "tmp_{}".format(i)
    pdf_file_path = path + file_name+".pdf"
    txt_file_path = path + file_name+".txt"
    if not os.path.exists(path):
        os.makedirs(path)
    with open(pdf_file_path, "wb") as f:
        f.write(mock_test_response.content)

    s = 'pdf2txt -o%s %s' % (txt_file_path, pdf_file_path)
    pdf2txt.main(s.split(' ')[1:])
    os.remove(pdf_file_path)

    with open(txt_file_path, "r+", encoding="utf-8") as f:
        tmp_text = f.read()
        clean_mock_paper_text = filter_mock_paper_text(tmp_text)
        f.write(clean_mock_paper_text)

        mock_test_pattern = r"(.*)(Answers\s?:)(.*)"
        mock_test_regex = re.compile(mock_test_pattern)
        g = mock_test_regex.findall(clean_mock_paper_text)
        questions_str = next(g.__iter__())[0]
        answers_str = next(g.__iter__())[2]

        # question_pattern = r"(\d+\.)(^(\d+\.))*"
        # questions_regex = re.compile(question_pattern)
Пример #7
0
        i + 1)
    mock_test_response = session.get(pdf_mock_test_url)  # stream=True
    mock_test_response.raise_for_status()
    mock_test_file = io.BytesIO(mock_test_response.content)
    # The pdfReader variable is a readable object that will be parsed
    path = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'
    file_name = "tmp_{}".format(i)
    pdf_file_path = path + file_name + ".pdf"
    txt_file_path = path + file_name + ".txt"
    if not os.path.exists(path):
        os.makedirs(path)
    with open(pdf_file_path, "wb") as f:
        f.write(mock_test_response.content)

    s = 'pdf2txt -o%s %s' % (txt_file_path, pdf_file_path)
    pdf2txt.main(s.split(' ')[1:])
    os.remove(pdf_file_path)

    with open(txt_file_path, "r+", encoding="utf-8") as f:
        tmp_text = f.read()
        clean_mock_paper_text = filter_mock_paper_text(tmp_text)
        f.write(clean_mock_paper_text)

        mock_test_pattern = r"(.*)(Answers\s?:)(.*)"
        mock_test_regex = re.compile(mock_test_pattern)
        g = mock_test_regex.findall(clean_mock_paper_text)
        questions_str = next(g.__iter__())[0]
        answers_str = next(g.__iter__())[2]

        # question_pattern = r"(\d+\.)(^(\d+\.))*"
        # questions_regex = re.compile(question_pattern)