def post(self, request, format=None):
    """Handle an uploaded .doc/.docx/.txt file: save it, extract its raw
    text and HTML, forward both to the analysis service, and return the
    service's JSON response with HTTP 201.

    Reads ``request.FILES['file']``; deletes the saved copy afterwards.
    """
    # tipo_analisis = request.POST['tipo_analisis']
    uploaded_file = request.FILES['file']
    file_name = uploaded_file.name
    # splitext is robust to names containing extra dots (split(".")[1] was not)
    file_extension = os.path.splitext(file_name)[1].lstrip(".")
    file_path = 'backendFondecyt/Docs/' + file_name
    # persist the upload; "with" guarantees the handle is closed
    with open(file_path, 'wb+') as destination:
        for chunk in uploaded_file.chunks():
            destination.write(chunk)
    if file_extension == "doc":
        file_name = self.converDocToDocx(file_name)
        file_path = 'backendFondecyt/Docs/' + file_name
    rawText = ""
    html = ""
    if file_extension in ("doc", "docx"):
        with open(file_path, "rb") as docx_file:
            rawText = mammoth.extract_raw_text(docx_file).value
            docx_file.seek(0)  # rewind before the second full read
            html = mammoth.convert_to_html(docx_file).value
    elif file_extension == "txt":
        with open(file_path, "r", encoding="utf-8") as txt_file:
            rawText = txt_file.read()
        # BUG FIX: the original iterated the file handle AFTER read() had
        # exhausted it, so html was always empty; build paragraphs from
        # the text we already read instead.
        for line in rawText.splitlines():
            if line.strip() != "":
                html += "<p>" + line + "</p>"
    payload = {'texto': rawText, 'html': html}
    data = requests.post('http://redilegra.com/general', data=payload)
    data = json.loads(data.text.encode('utf8'))
    os.remove(file_path)
    return Response(data, status.HTTP_201_CREATED)
def compose_request(self):
    """Render the docx template at ``self.template_path`` into an
    (html_body, text_body) tuple using ``self.request_information`` as the
    Django template context.
    """
    context = Context(self.request_information)
    html_template = Template(
        mammoth.convert_to_html(self.template_path).value)
    text_template = Template(
        mammoth.extract_raw_text(self.template_path).value)
    unprocessed_text = text_template.render(context)
    # in order to allow for line spacing, need br and p coming from Django context render
    text_content = re.sub(r"\<\/?p\>", "",
                          re.sub(r"\<br\>", "\n", unprocessed_text))
    # BUG FIX: the original returned a SECOND render of text_template and
    # discarded text_content entirely, so the <br>/<p> cleanup above had no
    # effect; return the cleaned plain text as the comment intends.
    return html_template.render(context), text_content
def extract_from_mammoth(source_dir, file, target_format):
    """Extract a document's content with mammoth.

    Converts to HTML when target_format is "html", otherwise to raw text,
    and returns the result wrapped in a single-element list.
    """
    import mammoth
    full_path = os.path.join(source_dir, file)
    with open(full_path, 'rb') as handle:
        if target_format == "html":
            extracted = mammoth.convert_to_html(handle)
        else:
            extracted = mammoth.extract_raw_text(handle)
    #print(extracted.value)
    return [extracted.value]
def extract_content_from_document(self, filename):
    """
    Extract content from a .docx file and return a (text, html) tuple.

    Raises ValueError for any extension other than '.docx'.
    """
    ext = os.path.splitext(filename)[1]
    if ext == '.docx':
        with open(filename, "rb") as f:
            html = mammoth.convert_to_html(f).value
            # BUG FIX: convert_to_html has already consumed the stream;
            # rewind explicitly instead of relying on zipfile's internal
            # seeking for the second full read.
            f.seek(0)
            text = mammoth.extract_raw_text(f).value
        return (text, html)
    else:
        # TODO: handle .doc
        raise ValueError("Can only handle .docx files, but got %s" % ext)
def validate_template_extension(foia_template):
    """Validate an uploaded FOIA template: allowed extension, size limit,
    and presence of the {{requested_records}} placeholder in the body.

    Raises ValidationError when any check fails.
    """
    _validate_extension(foia_template, VALID_FOIA_EXTENSIONS)
    # Make sure we'll have no problem reading or submitting this request
    # https://stackoverflow.com/questions/2472422/django-file-upload-size-limit -- 3rd answer
    limit = 2 * 1024 * 1024
    if foia_template.size > limit:
        raise ValidationError("File too large. Size should not exceed 2 MiB.")
    # BUG FIX: the filter group is now optional ("?"), so the bare
    # '{{ requested_records }}' placeholder the error message asks for
    # actually matches; previously only '{{ requested_records | filter }}'
    # passed validation.
    record_regex = re.compile(r"{{\s*requested_records(?:\s*\|\s*\w+)?\s*}}")
    document_text = mammoth.extract_raw_text(foia_template).value
    if not record_regex.search(document_text):
        raise ValidationError(
            "You need to place '{{requested_records}}' somewhere in the body of your template"
        )
def readFile(p, i):
    """Read one file named *i* in directory *p* (.pdf/.txt/.docx), strip
    non-word symbols, lowercase, remove punctuation and stop words, and
    return the resulting token list.
    """
    mergedLines = []
    # compile once; raw string avoids invalid-escape warnings on "\w"
    word_pattern = re.compile(r"[^\w]")
    if i.endswith(".pdf"):
        # merge the text of every page
        with open(path.join(p, i), 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for page in range(pdfReader.numPages):
                text = pdfReader.getPage(page).extractText()
                # change unnecessary symbols to null
                mergedLines += word_pattern.sub(" ", text).split()
    elif i.endswith(".txt"):
        # BUG FIX: the text file handle was never closed; "with" releases it.
        with open(path.join(p, i), "r") as txtFile:
            lines = txtFile.readlines()
        # NOTE(review): only the FIRST line is processed, preserving the
        # original behavior -- confirm whether the whole file was intended.
        mergedLines += word_pattern.sub(" ", lines[0]).split()
    elif i.endswith(".docx"):
        with open(path.join(p, i), "rb") as docx_file:
            text = mammoth.extract_raw_text(docx_file).value
        mergedLines += word_pattern.sub(" ", text).split()
    # Convert all characters to lowercase before tokenizing.
    mergedLines = [w.lower() for w in mergedLines]
    # Tokenizing our text
    mergedLines = [
        ''.join(c for c in s if c not in string.punctuation)
        for s in mergedLines
    ]
    # Add all word if it is not stop word.
    mergedLines = [w for w in mergedLines if w not in stop]
    #print(mergedLines)
    return mergedLines  # return processed text
def upload_version(request, contract_id):
    """Handle a POSTed contract-version upload: extract the word document's
    text, attach contract and uploader metadata, save, apply amendments,
    then redirect back to the contract view."""
    if request.method == 'POST':
        form = VersionForm(request.POST, request.FILES)
        if form.is_valid():
            version = form.save(commit=False)
            # process the form data to extract the word document text
            version.text = extract_raw_text(version.file).value
            version.contract = Contract.objects.get(pk=contract_id)
            version.uploaded_by = get_first_name()
            version.save()
            make_amendments(version.contract, version)
            return HttpResponseRedirect('/contracts/view/' + str(contract_id))
def read_and_clean_file(file_name, destination_path, stop_words):
    """Extract words from a .txt/.docx/.pdf file, lowercase them, and drop
    stop words and punctuation.

    :param file_name: name of the file inside destination_path
    :param destination_path: directory containing the file
    :param stop_words: extra stop words to remove on top of English ones
    :return: list of cleaned keywords
    """
    # keep only ASCII letters, underscore and Turkish letters;
    # compiled once instead of re-parsed for every line/page
    non_word = re.compile("[^a-zA-Z_şŞğĞüÜİöÖçÇı]")
    full_path = path.join(destination_path, file_name)
    text = []
    if file_name.endswith(".txt"):
        with open(full_path, 'r') as text_file:
            # iterate the file directly instead of a readline() while-loop
            for raw_line in text_file:
                text += non_word.sub(" ", raw_line).split()
    elif file_name.endswith(".docx"):
        with open(full_path, 'rb') as docx_file:
            # use mammoth package to convert docx to text
            raw_content = mammoth.extract_raw_text(docx_file).value
        text += non_word.sub(" ", raw_content).split()
    elif file_name.endswith(".pdf"):
        with open(full_path, 'rb') as pdf_file:
            # use PyPDF2 package to convert pdf to text
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            for page in range(0, pdf_reader.numPages):
                raw_content = pdf_reader.getPage(page).extractText()
                text += non_word.sub(" ", raw_content).split()
    # convert letters to lower forms
    text = [letter.lower() for letter in text]
    # English stop words plus the caller-provided extras
    stop_words_list = set(stopwords.words('english')).union(stop_words)
    # clean stop words and punctuations
    return [
        word for word in text
        if word not in stop_words_list and word not in string.punctuation
    ]
def process_data(self, directory):
    """Extract raw text from every document in *directory* with mammoth,
    run each through the instance's NLP pipeline (denoise -> POS tagging ->
    dataset creation), then train a Word2Vec model and save it to
    models/word2vec.bin. Files that fail any step are skipped.
    """
    for f in os.listdir(directory):
        with open('{}/{}'.format(directory, f), 'rb') as flav:
            print("Processing file ---> {}".format(os.path.basename(f)))
            try:
                # result = mammoth.convert_to_html(flav)
                # doc_results = result.value
                result = mammoth.extract_raw_text(flav)
                doc_results = result.value
                results = self.denoise_text(doc_results)
                meta = self.get_POS(results)
                status = self.create_dataset(meta)
            except Exception as e:
                # NOTE(review): broad catch skips any failing file; the
                # filename is not logged with the error -- consider adding it.
                print(e)
                continue
    # NOTE(review): all_sentences is not defined in this method -- presumably
    # a module/global accumulator filled by create_dataset; confirm before
    # relying on this.
    model = models.Word2Vec(all_sentences, min_count=1)
    model.save_word2vec_format("models/word2vec.bin")
def index(request):
    """Render the template-storage index page; on POST, persist an uploaded
    contract template together with its extracted document text."""
    if request.method == 'POST':
        form = ContractTemplateForm(request.POST, request.FILES)
        if form.is_valid():
            template = form.save(commit=False)
            # pull the word document's raw text into the model field
            template.text_content = extract_raw_text(template.original_file).value
            template.save()
            return HttpResponseRedirect('/templates')
    else:
        form = ContractTemplateForm()
    context = {
        'categories': ContractCategory.objects.all(),
        'form': form,
    }
    return render(request, 'templateStorage/index.html', context)
def readFilesInDirAndCleanWordList(stop):
    """Read every .txt/.docx/.pdf file under the sibling 'text files/'
    directory, merge their cleaned words, strip punctuation, lowercase,
    and drop the given stop words.

    :param stop: collection of stop words to exclude
    :return: list of cleaned, lowercased tokens
    """
    dire = path.dirname(__file__)
    p = os.path.join(dire, 'text files/')
    mergedLines = []
    # compiled once; raw string avoids invalid-escape warnings on "\w"
    word_pattern = re.compile(r"[^\w]")
    for name in os.listdir(os.path.join(dire, 'text files')):
        if name.endswith('.txt'):
            # BUG FIX: the text file handle was never closed
            with open(path.join(p, name), "r") as txt_file:
                lines = txt_file.readlines()
            # NOTE(review): only the FIRST line is cleaned, preserving the
            # original behavior -- confirm whether all lines were intended.
            mergedLines += word_pattern.sub(" ", lines[0]).split()
        elif name.endswith('.docx'):
            with open(path.join(p, name), "rb") as docx_file:
                text = mammoth.extract_raw_text(docx_file).value
            mergedLines += word_pattern.sub(" ", text).split()
        elif name.endswith('.pdf'):
            # BUG FIXES: the pdf handle was never closed, and the inner page
            # loop reused the outer loop variable ("i"), shadowing it.
            with open(path.join(p, name), 'rb') as pdfFileObj:
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                for page in range(0, pdfReader.numPages):
                    text = pdfReader.getPage(page).extractText()
                    mergedLines += word_pattern.sub(" ", text).split()
    mergedLines = [
        ''.join(c for c in s if c not in string.punctuation)
        for s in mergedLines
    ]
    mergedLines = [w.lower() for w in mergedLines]
    mergedLines = [w for w in mergedLines if w not in stop]
    return mergedLines
def can_extract_raw_text():
    """extract_raw_text flattens a simple list into newline-separated text
    with no conversion messages."""
    with open(test_path("simple-list.docx"), "rb") as fileobj:
        extraction = mammoth.extract_raw_text(fileobj=fileobj)
    assert_equal([], extraction.messages)
    assert_equal("Apple\n\nBanana\n\n", extraction.value)
def set_text(self):
    """Return the raw text mammoth extracts from ``self.file``."""
    extraction = mammoth.extract_raw_text(self.file)
    return extraction.value
# In[ ]: type(entries) # In[ ]: import glob import errno import mammoth import pandas as pd profiles = [] path = r'C:\Users\abdul\Downloads\RESUMES\*.docx' files = glob.glob(path) for name in files: document = open(name, 'rb') profiles.append((mammoth.extract_raw_text(document).value)) # In[ ]: files # In[ ]: len(profiles) # In[ ]: all_Nigeria_uni = { 'air force institute of technology', 'alex ekwueme university', 'federal university gashua', 'federal university dutse', 'federal university gusau ', 'federal university kashere',