Example #1
 def parsePhone():
     tika.initVM()
     parsed = parser.from_file("MyResume.docx")
     regular_expression = re.compile(r"(\+91\s*)?[0-9]{10}",
                                     re.IGNORECASE)
     result = re.search(regular_expression, parsed["content"])
     print("phone No:" + result.group())
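The excerpt above omits its imports; a minimal self-contained sketch of the same idea (the file name is taken from the excerpt, everything else is illustrative):

import re

import tika
from tika import parser

tika.initVM()


def parse_phone(path="MyResume.docx"):
    # Optional "+91" prefix, optional whitespace, then a 10-digit number (illustrative pattern).
    parsed = parser.from_file(path)
    result = re.search(r"(\+91\s*)?\d{10}", parsed["content"])
    if result:
        print("phone No: " + result.group())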
Example #2
async def convert_pdf_to_txt(pdf_path: str, save_dir: str) -> None:
    """
    This function converts a pdf file to a txt file. It cleans the text.
    
    Parameters:
    pdf_path (str): The path where the pdf to convert is located
    save_dir (str): The path where the converted text file is saved
    
    Returns:
    None
    """
    if not hasattr(convert_pdf_to_txt, 'nlp'):
        convert_pdf_to_txt.nlp = spacy.load(ACCEPTED_LANGUAGES['es'])
        convert_pdf_to_txt.nlp.add_pipe(convert_pdf_to_txt.nlp.create_pipe('sentencizer'))
    try:
        tika.initVM()
        pdf_file = parser.from_file(pdf_path)
        async with AIOFile(save_dir, 'w') as text_file:
            doc = convert_pdf_to_txt.nlp(pdf_file['content'])
            #print(doc)
            text = ''.join([re.sub(r'[,|;|\b]\n+\b', '\n', re.sub(r'\b\n+\b', '\n', s.text))
                            for s in doc.sents]) # Fix sentences that have more newlines than they should
            paragraphs = split_text_into_paragraphs(text) # Eliminate extra newlines between paragraphs
            new_text = '\n\n'.join(paragraphs)
            new_text = re.sub(r'-\s*\n+', '', new_text) # Join split words.
            print(new_text)
            await text_file.write(new_text)

    except Exception as e:
        raise e
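Since convert_pdf_to_txt is a coroutine, it has to be driven by an event loop. A hedged usage sketch (the paths are illustrative, and ACCEPTED_LANGUAGES, spacy, and the AIOFile import are assumed to be available as in the surrounding module):

import asyncio

# Illustrative call only; "paper.pdf" and "paper.txt" are made-up paths.
asyncio.run(convert_pdf_to_txt("paper.pdf", "paper.txt"))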
Example #3
def main():
    url = 'https://www.harrisonburgva.gov/sites/default/files/Police/files/POLICIES/Use_of_Force-1.pdf'
    file_path = '/Users/dturcan/Docs/campaign_zero/use_of_force_docs/harrisonburg_va.pdf'

    # Extract all of the lines
    tika.initVM()
    parsed = parser.from_file(file_path)
    content = parsed["content"]
    uof_parser = UOFParser(content)

    # Read in config
    try:
        config = yaml.safe_load(open('config.yaml'))
    except FileNotFoundError:
        config = yaml.safe_load(open('uof_parser/config.yaml'))

    # Run indicators:
    for policy, policy_indicators in config.items():
        print('-------------')
        print("Checking", policy)
        result = uof_parser.perform_search(
            policy_indicators.get('search_terms', []),
            policy_indicators.get('phrases_for_positive_indicator', []))
        print()
        print(policy, ":", result[0])
        print('Context:')
        print(result[1])
        print()
Example #4
 def __init__(self, server_url=None):
     if server_url:
         os.environ['TIKA_CLIENT_ONLY'] = 'True'
         os.environ['TIKA_SERVER_ENDPOINT'] = server_url
         print("Tika Server Endpoint %s" % os.environ['TIKA_SERVER_ENDPOINT'])
     import tika
     tika.initVM()
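Setting TIKA_CLIENT_ONLY and TIKA_SERVER_ENDPOINT makes tika-python talk to an existing Tika server instead of spawning its own JVM. A hedged sketch of a direct call against such a server (the URL and file name are assumptions; example #33 below shows the same endpoint argument being passed explicitly to from_file):

from tika import parser

# Assumes a Tika server is already reachable at this URL (illustrative).
parsed = parser.from_file("report.pdf", "http://localhost:9998")
print(parsed["metadata"].get("Content-Type"))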
Example #5
 def parseEmail():
     tika.initVM()
     parsed = parser.from_file("MyResume.docx")
     regular_expression = re.compile(
         r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}", re.IGNORECASE)
     result = re.search(regular_expression, parsed["content"])
     print("Email id:" + result.group())
Example #6
def main():

    count = 0
    data_files = '/home/nimesh/Desktop/1000'

    output_file = open('grobidquantity_data_test.json', 'w+')
    for root, dirs, files in os.walk(data_files):
        for file in files:

            try:
                if file != '.DS_Store':
                    count += 1
                    print(count)
                    path = os.path.join(root, file)
                    tika.initVM()
                    parsed = tika.parser.from_file(path)

                    if "content" in parsed.keys():

                        mimetype = parsed.get("metadata").get("Content-Type")
                        print(mimetype)
                        content = parsed["content"]

                        if content is not None and ('application/pdf' in mimetype or 'application/xml' in mimetype or 'text/plain' in mimetype):
                            # Query the local grobid-quantities service with the extracted text.
                            p = os.popen('curl -GET --data-urlencode "text=' + content + '" localhost:8080/processQuantityText').read()
                            json.dump(p, output_file)
                        output_file.write('\n')
            except Exception:
                continue
    output_file.close()
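Shelling out to curl through os.popen breaks as soon as the extracted text contains quotes; the same grobid-quantities request could be made with the requests library instead. A hedged sketch (the endpoint comes from the snippet above, everything else is illustrative):

import requests


def process_quantities(text):
    # Mirrors the curl -G --data-urlencode call above, but lets requests handle the URL encoding.
    resp = requests.get("http://localhost:8080/processQuantityText", params={"text": text})
    resp.raise_for_status()
    return resp.json()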
Example #7
def fileworker(filequeue, dbqueue, monitorqueue, uforiamodules, config,
               rcontext):
    """
Receives a file item from file_scanner inside the filequeue and
executes the file_processor for that file. The fileworker operates
as the entry point for each process, and is therefore also responsible
for the execution of any expensive library initialization code.

filequeue - The file queue
dbqueue - The database queue
monitorqueue - The monitoring queue to show information about the
current file
uforiamodules - The uforia module objects from modulescanner
config - The uforia configuration file
rcontext - The recursion context
"""
    # Start the JCC JVM runtime for Tika
    if not rcontext.jvm_initialized:
        import tika
        tika.initVM()
        rcontext.jvm_initialized = True

    while True:
        item = filequeue.get()
        if item is None:
            # Finished.
            break
        else:
            file_processor(item, dbqueue, monitorqueue, uforiamodules, config,
                           rcontext)
            filequeue.task_done()
    filequeue.task_done()
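The worker exits its loop when it pulls a None sentinel from the queue. A hedged sketch of how a producer might feed such workers (the multiprocessing wiring is illustrative and stands in for Uforia's real file_scanner, dbqueue, config, and rcontext objects):

import multiprocessing


def feed(paths, num_workers):
    # Illustrative producer: queue every file, then one sentinel per worker so each loop can exit.
    filequeue = multiprocessing.JoinableQueue()
    for path in paths:
        filequeue.put(path)
    for _ in range(num_workers):
        filequeue.put(None)
    return filequeue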
Example #8
File: uforia.py Project: uforia/Uforia
def fileworker(filequeue, dbqueue, monitorqueue, uforiamodules, config,
               rcontext):
    """
Receives a file item from file_scanner inside the filequeue and
executes the file_processor for that file. The fileworker operates
as the entry point for each process, and is therefore also responsible
for the execution of any expensive library initialization code.

filequeue - The file queue
dbqueue - The database queue
monitorqueue - The monitoring queue to show information about the
current file
uforiamodules - The uforia module objects from modulescanner
config - The uforia configuration file
rcontext - The recursion context
"""
    # Start the JCC JVM runtime for Tika
    if not rcontext.jvm_initialized:
        import tika
        tika.initVM()
        rcontext.jvm_initialized = True

    while True:
        item = filequeue.get()
        if item is None:
            # Finished.
            break
        else:
            file_processor(item, dbqueue, monitorqueue, uforiamodules,
                           config, rcontext)
            filequeue.task_done()
    filequeue.task_done()
Example #9
def pdf2text(file):
    tika.initVM()
    parsed = parser.from_file(file)
    data = parsed["content"]
    list_sen = re.split(r'\s{4,}', data)  # str.split treats the pattern as a literal string; a regex split is intended here
    for i in range(0, len(list_sen)):
        list_sen[i] = " ".join(list_sen[i].split())
    return annotator.tokenize(list_sen[0])
Example #10
 def __init__(self, **kwargs):
     server_url = kwargs['tika_url']
     if server_url:
         os.environ['TIKA_CLIENT_ONLY'] = 'True'
         os.environ['TIKA_SERVER_ENDPOINT'] = server_url
         print("Tika Server Endpoint %s" % os.environ['TIKA_SERVER_ENDPOINT'])
     import tika
     tika.initVM()
Example #11
    def __init__(self,
                 beanstalk_host: Text = "127.0.0.1",
                 beanstalk_port: int = 11300):

        self.client: Optional[greenstalk.Client] = None
        self.connect(beanstalk_host, beanstalk_port)

        logging.info("initialize tika VM")
        tika.initVM()
Example #12
 def get_content(self, data):
     import tika
     tika.initVM()
     try:
         parsed = tika.parser.from_file('/path/to/file')
         print(parsed["metadata"])
         return parsed["content"]
     except:
         return "Error parsing content"
Example #13
    def __init__(self, tika_server_url):
        super(TikaParser, self).__init__('tika_parser')

        if tika_server_url:
            os.environ['TIKA_CLIENT_ONLY'] = 'True'
            os.environ['TIKA_SERVER_ENDPOINT'] = tika_server_url
            print("Tika Server Endpoint %s" %
                  os.environ['TIKA_SERVER_ENDPOINT'])
        tika.initVM()
Example #14
 def __init__(self, path, **kwargs):
     if type(self).parser is None:
         # Tika is conditionally imported here
         import tika
         # automatically downloads the tika jar and starts a JVM process if no REST API
         # is configured in ENV
         tika.initVM()
         from tika import parser as tk_parser
         type(self).parser = tk_parser
     super(TikaPreprocessor, self).__init__(path, **kwargs)
Example #15
 def __init__(self, path, **kwargs):
     if type(self).parser is None:
         # Tika is conditionally imported here
         import tika
         # automatically downloads the tika jar and starts a JVM process if no REST API
         # is configured in ENV
         tika.initVM()
         from tika import parser as tk_parser
         type(self).parser = tk_parser
     super(TikaPreprocessor, self).__init__(path, **kwargs)
Example #16
def extract_and_store(ftype):
    if not os.path.exists("../data/{}_raw_text".format(ftype)):
        os.mkdir("../data/{}_raw_text".format(ftype))
    tika.initVM()
    for fidx, file in enumerate(os.listdir("../data/{}_pdfs".format(ftype))):
        print("{} -- {}".format(fidx, file))
        raw_text = extract_raw_text(ftype, file).encode('utf8')
        raw_text_path = "../data/{}_raw_text/{}".format(ftype, file.replace(".pdf",".txt"))
        with open(raw_text_path, "wb") as f:
            f.write(raw_text)
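extract_raw_text is not shown in this excerpt; a hedged guess at its shape, based on how the other examples on this page call Tika (the directory layout is taken from the loop above, the body itself is an assumption):

from tika import parser


def extract_raw_text(ftype, file):
    # Hypothetical helper, not from the original project: parse one PDF and return its text.
    parsed = parser.from_file("../data/{}_pdfs/{}".format(ftype, file))
    return parsed["content"] or ""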
Example #17
 def parseAnyFile():
     tika.initVM()
     parsed = parser.from_file("sample resume.docx")
     print(parsed["content"])
     print("------details after parsing resume--------")
     name=re.compile(r"([a-zA-Z]){3,}\s([a-zA-Z]){3,}")
     print("Name:"+re.search(name,parsed["content"]).group())
     ph=re.compile(r"(\d{10})")
     print("Phone Number:"+re.search(ph,parsed["content"]).group())
     mail = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}", re.IGNORECASE)
     print("Mail ID:"+re.search(mail,parsed["content"]).group())
     address =re.compile(r"[a-z\S0-9\S]+,\n+[a-zA-Z,\n]+[a-zA-Z.]")
     print("Address:"+re.search(address,parsed["content"]).group())
Example #18
def extractPDFwithTika(arqs):
    """Using Apache Tika to extract PDF text - https://pypi.org/project/tika/
    Arguments:
        arqs {list} -- A list of file names, each with its path
    """
    # loading the Tika .jar / starting the server makes the first call noticeably slower
    tika.initVM()
    for arq in arqs:
        timeIni = perf_counter()
        textoCompleto = parser.from_file(arq)
        fileName = os.path.basename(arq)
        timeEnd = perf_counter()
        timeTotal = timeEnd - timeIni
        printMiniReport(textoCompleto["content"], fileName, "Tika", timeTotal)
        saveText(textoCompleto["content"], fileName, "Tika")
    print("--- Tika ---")
Example #19
class TikaPreprocessor(DocPreprocessor):
    """
    This preprocessor uses the `Apache Tika <http://tika.apache.org>`_ parser to
    retrieve text content from complex file types such as DOCX, HTML and PDFs.

    Documentation for customizing Tika is 
    `here <https://github.com/chrismattmann/tika-python>`_

    Example::

        !find pdf_dir -name *.pdf > input.csv # list of files
        from snorkel.parser import (
            TikaPreprocessor, CSVPathsPreprocessor, CorpusParser
        )
        CorpusParser().apply(
            CSVPathsPreprocessor('input.csv', parser_factory=TikaPreprocessor)
        )
    """
    # Tika is conditionally imported here
    import tika
    # automatically downloads the tika jar and starts a JVM process if no REST API
    # is configured in ENV
    tika.initVM()
    from tika import parser as tk_parser
    parser = tk_parser

    def parse_file(self, fp, file_name):
        parsed = type(self).parser.from_file(fp)
        txt = parsed['content']
        name = os.path.basename(fp).rsplit('.', 1)[0]
        stable_id = self.get_stable_id(name)
        doc = Document(name=name,
                       stable_id=stable_id,
                       meta={'file_name': file_name})
        yield doc, txt
Example #20
def run():
    """
    Starts Uforia.

    Sets up the database, modules, and then
    invokes the file_scanner.
    """

    recursive = rcontext.is_recursive
    if not recursive:
        print("Uforia starting...")

    db = database.Database(config)
    if not recursive:
        db.setup_main_table()
        db.setup_mimetypes_table()

    if config.ENABLEMODULES:
        if config.DEBUG:
            print("Detecting available modules...")
        uforiamodules = modules.Modules(config, db, rcontext)
        if not recursive:
            fill_mimetypes_table(uforiamodules)
    else:
        uforiamodules = ''

    # Start the JCC JVM runtime for Tika
    if not rcontext.jvm_initialized:
        import tika
        tika.initVM()
        rcontext.jvm_initialized = True

    if config.DEBUG:
        print("Starting producer...")
    if os.path.exists(config.STARTDIR):
        file_scanner(config.STARTDIR, uforiamodules, rcontext)
    else:
        print("The pathname " + config.STARTDIR +
              " does not exist, stopping...")

    if not recursive:
        print("\nUforia completed...\n")
Example #21
def run():
    """
    Starts Uforia.

    Sets up the database, modules, and then
    invokes the file_scanner.
    """

    recursive = rcontext.is_recursive
    if not recursive:
        print("Uforia starting...")

    db = database.Database(config)
    if not recursive:
        db.setup_main_table()
        db.setup_mimetypes_table()

    if config.ENABLEMODULES:
        if config.DEBUG:
            print("Detecting available modules...")
        uforiamodules = modules.Modules(config, db, rcontext)
        if not recursive:
            fill_mimetypes_table(uforiamodules)
    else:
        uforiamodules = ''

    # Start the JCC JVM runtime for Tika
    if not rcontext.jvm_initialized:
        import tika
        tika.initVM()
        rcontext.jvm_initialized = True

    if config.DEBUG:
        print("Starting producer...")
    if os.path.exists(config.STARTDIR):
        file_scanner(config.STARTDIR, uforiamodules, rcontext)
    else:
        print("The pathname " + config.STARTDIR + " does not exist, stopping...")

    if not recursive:
        print("\nUforia completed...\n")
Example #22
def getPDF(filename):
    import tika
    tika.initVM()
    from tika import parser
    parsed = parser.from_file(filename)
    return parsed["content"].split("\n")
Example #23
from logging import info
from pathlib import Path

import tika
from web.datasets.services import get_s3_client
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from dotenv import find_dotenv, load_dotenv
from dramatiq import actor, set_broker
from dramatiq.brokers.rabbitmq import RabbitmqBroker
from tika import parser

tika.initVM()  # noqa

# This (ugly) block lets this module work both inside and outside Django
try:
    from web.datasets.models import File
except ImproperlyConfigured:
    import configurations
    import os

    os.environ.setdefault("DJANGO_CONFIGURATION", "Dev")
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "web.settings")
    load_dotenv(find_dotenv())
    configurations.setup()
    from web.datasets.models import File

rabbitmq_broker = RabbitmqBroker(url=settings.BROKER_URL)
set_broker(rabbitmq_broker)
client = get_s3_client(settings)
Example #24
def textpdf(document):

    tika.initVM()
    parsed = parser.from_file(document.encode('utf-8'))
    return parsed["content"]
Example #25
 def startVM(self, proxy_host, proxy_port, httpsProxy, no_proxy):
     args = self.__create_vm_args(proxy_host, proxy_port, httpsProxy, no_proxy)
     log.debug("Args for JVM: {}. Maxheap: {}".format(args,self.jvm_max_heap))
     vm=tika.initVM(maxheap=self.jvm_max_heap, vmargs=args)
     return vm
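__create_vm_args is private to the class and not shown; the arguments it builds are presumably the standard JVM proxy system properties. A hedged sketch of such a helper (the property names are standard Java options, but the exact format expected by the original class is an assumption):

def create_vm_args(proxy_host, proxy_port, https_proxy, no_proxy):
    # Hypothetical stand-in for the private __create_vm_args helper.
    args = []
    if proxy_host:
        args += ["-Dhttp.proxyHost={}".format(proxy_host),
                 "-Dhttp.proxyPort={}".format(proxy_port)]
    if https_proxy:
        args.append("-Dhttps.proxyHost={}".format(https_proxy))
    if no_proxy:
        args.append("-Dhttp.nonProxyHosts={}".format(no_proxy))
    return args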
Example #26
import sys
import os
import tika
tika.initVM()
from tika import parser
def read_html_file(fileop):
    parsed = parser.from_file(fileop)
    return(parsed["content"])

def read_training_file(filename):
    filewr = open("big.txt", "w", encoding="utf-8")
    HTMLDirs = next(os.walk(filename))[2]
    print(HTMLDirs)
    count = 0
    for files in HTMLDirs:
        fileop = os.path.join(filename, files)
        text = read_html_file(fileop)
        print(text)
        # The file is opened in text mode, so write the string itself;
        # encoding it here would pass bytes and raise a TypeError on Python 3.
        filewr.write(text)
        count = count + 1
        print(count)
    filewr.close()

trainingfile = sys.argv[1]
read_training_file(trainingfile)     
Example #27
 def test_tika(self):
     image = 'https://raw.githubusercontent.com/nmcteam/image-with-text/master/example/destination.jpg'
     tika.initVM()
     parsed = parser.from_file(image)
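As written the test only exercises the call and never asserts anything; a hedged, self-contained version (the class name and the added assertions are illustrative, not from the original suite):

import unittest

import tika
from tika import parser


class TikaSmokeTest(unittest.TestCase):
    def test_tika(self):
        image = 'https://raw.githubusercontent.com/nmcteam/image-with-text/master/example/destination.jpg'
        tika.initVM()
        parsed = parser.from_file(image)
        # Added checks (suggestion only): the parse should return metadata and content keys.
        self.assertIn("metadata", parsed)
        self.assertIn("content", parsed)


if __name__ == "__main__":
    unittest.main()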
Example #28
File: appback.py Project: svalles/PFC1
def fileanalisis(f_in_tika):

    global patrones
    global busqueda_pln

    # resultados table
    # Stores a summary of the total number of findings for each type found
    # Format: name, hash, impact, occurrence count, risk
    resultados = []

    # resultadodetalle table
    # Stores the detail of each item found.
    # Format: type, data
    resultadodetalle = []

    #################################
    # Parse the file with Tika
    #################################
    tika.initVM()
    parsed = parser.from_file(f_in_tika)

    # Extract the content of the parsed file. The other option would be to extract the file's metadata.
    doctika = parsed["content"]

    # Load spaCy, the natural language processor (NLP)
    nlp = spacy.load('en_core_web_sm')
    # To test with Spanish, use the next line instead.
    # nlp = spacy.load('es_core_news_sm')

    # Load the elements of "patrones" into the results table, converting the names to hashes.
    # The risk column stays at zero since it will be computed later.
    for name, patron, impacto in patrones:
        resultados.append([name, nlp.vocab.strings[name], impacto, 0, 0])

    # Load the elements of "busqueda_pln" into the results table.
    # The risk column stays at zero since it will be computed later.
    for name, hash, impacto in busqueda_pln:
        resultados.append([name, hash, impacto, 0, 0])

    print("\nNombre de archivo a analizar", f_in_tika)

    #########################################################################
    # Process the string of characters returned by Tika with spaCy
    #########################################################################
    doc = nlp(parsed["content"])

    # Create the object that will hold all the matches
    matcher_obj = Matcher(nlp.vocab)

    #########################################################
    # busqueda_pln by regular expressions (patrones list)
    #########################################################
    # Add every pattern to search for to the matcher object so spaCy can look for the regular expressions.
    for nombre, pat, impacto in patrones:
        matcher_obj.add(nombre, None, pat)

    # Perform the search.
    # Stores every regular-expression match in the "coincidencias" list.
    # The list format is hash, start, end (positions within the document).
    coincidencias = matcher_obj(doc)

    # Walk the list of matches and look them up in the results table using the hash as the id.
    # For each finding, increment the cant_ocurrencias (occurrence count) field by 1.
    for var in range(len(coincidencias)):
        hash = coincidencias[var][0]
        for index in range(len(resultados)):
            if hash == resultados[index][1]:
                resultadodetalle.append([
                    resultados[index][0],
                    doc[coincidencias[var][1]:coincidencias[var][2]].text
                ])
                resultados[index][3] += 1

    # Remove duplicates
    resultadodetalle = removeDuplicates(resultadodetalle)

    ######################################################################
    # busqueda_pln via spaCy NLP, using Named Entity Recognition (NER)
    ######################################################################
    # Entities to search for, given as name, hash (all 0), impact

    # List of entities found
    entidades = []

    # Go through the entities found, as specified in the search, and add the findings to "entidades"
    for ent in doc.ents:
        for index in range(len(busqueda_pln)):
            if ent.label_ == busqueda_pln[index][0]:
                entidades.append([ent.label_, ent.text])
                resultadodetalle.append([ent.label_, ent.text])

    # Remove duplicates
    entidades = removeDuplicates(entidades)

    print("\nEntidades\n")
    print(entidades)

    # Print the detailed results sorted by event type
    print("\nHallazgos en detalle")
    resultadodetalle = sorted(resultadodetalle,
                              key=lambda item: item[0],
                              reverse=False)
    for nombre, detalle in resultadodetalle:
        print(nombre, detalle)

    # Walk all the entities found again and, for each finding, add 1 to the results table
    for ente in range(len(entidades)):
        tipo = entidades[ente][0]
        for index in range(len(resultados)):
            if tipo == resultados[index][0]:
                resultados[index][3] += 1

    ###############################################################################################
    # Compute the file risk with the formula Risk = Impact * number of occurrences
    ###############################################################################################
    for resul in range(len(resultados)):
        resultados[resul][4] = resultados[resul][2] * resultados[resul][3]

    # Sort the list by the risk column and print it again
    resultados = sorted(resultados, key=lambda item: item[4], reverse=True)

    riesgoarchivo = 0

    # Print the final summary statistics
    print("\nTABLA DE RESULTADOS\n")
    print("\nTIPO - IMPACTO - CANTIDAD - RIESGO\n")
    for linea in range(len(resultados)):
        print(resultados[linea][0], resultados[linea][2], resultados[linea][3],
              resultados[linea][4])
        riesgoarchivo += resultados[linea][4]
    print("\nRiesgo de archivo:", riesgoarchivo)

    # Check the total number of findings
    print('Total de matcheos en el documento:', len(resultados))
    return riesgoarchivo, resultados, resultadodetalle, doctika
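The globals patrones and busqueda_pln are defined elsewhere in appback.py; from the way they are consumed above, patrones entries look like (name, spaCy Matcher pattern, impact) and busqueda_pln entries like (entity label, hash, impact). A hedged illustration with made-up values (the real lists are not shown on this page):

# Illustrative data only; a Matcher pattern is a list of per-token attribute
# dicts, which matches the three-argument matcher_obj.add(...) call above.
patrones = [
    ("EMAIL", [{"LIKE_EMAIL": True}], 5),
    ("DNI", [{"TEXT": {"REGEX": r"^\d{7,8}$"}}], 8),
]

# Entity labels produced by spaCy's NER pipeline, with a placeholder hash of 0.
busqueda_pln = [
    ("PERSON", 0, 3),
    ("ORG", 0, 2),
]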
Example #29
 def __init__(self):
     print('initializing pdf reader...')
     tika.initVM()
     print('done.')
Example #30
def getpdf(filename):
    
    # print("텍스트 파일을 추출할 PDF 파일명을 입력하세요.")
    tmp3 = settings.BASE_DIR / 'images' / filename
    tmp3=str(tmp3)
    print(tmp3,'%#$')
    # PDFfileName = 'documents/' + input() + '.pdf'

    # Open the PDF and get the device, interpreter, and pages variables.
    device, interpreter, pages = pdfopen(tmp3)
    if device == -1 and interpreter == -1 and pages == -1:
        print("PDF 파일을 잘못 입력했습니다.")
        exit()

    # Read the PDF, collect the text lists, extract the title, fix the spacing, and return the most frequently used text size.
    text_list, textfont_list, textmiddle_list, title_num, title_data, image_name, image_list, textmiddle_average, textfont_average, char_list = pdfread(device, interpreter, pages, filename)
    title_data = title_return(title_data).strip()
    print(title_data)

    translator = Translator()
    translator_cnt = 0
    print(char_list,'$#@')
    if len(char_list) > 0:
        # print("논문 형식에 따라, 논문 전체 내용을 요약합니다.")

        print_result = "논문 내용\n"
        # Correct the spelling.
        # print("맞춤법 교정 시작!")
        result = char_list.strip().split(".")
        final_result = ""
        translate_result = ""
        for y in range(len(result)):
            # print("맞춤법 교정 중.... " + str(round((y+1) / (len(result)+1) * 100, 2)) + "%")
            if len(result[y]) > 0:
                try:
                    temp = spell_checker.check(result[y] + '.')
                    final_result += temp.as_dict()['checked']
                    print_result += temp.as_dict()['checked'] + "\n"
                    if translator_cnt == 0:
                        if translator.translate(temp.as_dict()['checked']).src == 'ko':
                            translator_cnt = 1
                        elif translator.translate(temp.as_dict()['checked']).src == 'en':
                            translate_result += translator.translate(temp.as_dict()['checked'], dest='ko').text
                except:
                    final_result += result[y] + ". "
                    print_result += result[y] + ".\n"
                    if translator_cnt == 0:
                        if translator.translate(result[y]).src == 'ko':
                            translator_cnt = 1
                        elif translator.translate(result[y]).src == 'en':
                            translate_result += translator.translate(result[y], dest='ko').text
        # print("맞춤법 교정 완료!")
        # print("")

    else:
        try:
            # Fetch the related information from the KCI site.
            # print("PDF 논문 분석 중...")
            link_data, title_data_ko, title_data_en, title_data_plus1, title_data_plus2, journalInfo1, journalInfo2, journalInfo3, name1, name2, content1, content2, content3, content4, reference = crawling_setting(title_data)
            # print("PDF 논문 분석 완료!")
            # print("")
        except:
            link_data = -1
    
        text_list = list_return(text_list)
        collect_loc = maxsize_return(text_list, textfont_list)
        # print(collect_loc)

        # Split the multi-column layout and merge the lists that share the same font size.
        text_list, textfont_list, figure_name, figure_list = pdfsort(text_list, textfont_list, textmiddle_list, textmiddle_average, textfont_average)
        text_list, textfont_list = pdfgrap(text_list, textfont_list)
        text_list, textfont_list = pdfcutter(text_list, textfont_list, title_num, collect_loc)

        # Join all of the related text.
        result = ""
        print_result = ""
        for y in range(len(text_list)):
            result += text_list[y] + " "

        if link_data == -1:
            pass
            print("KCI에 등록되어 있지 않은 논문이거나 사이트 액세스 오류입니다.")
        else:
            # Add the related information.
            print_result = "링크 : " + link_data + "\n\n"
            print_result += "논문 제목(한글) : " + title_data_ko + "\n\n"
            print_result += "논문 제목(영어) : " + title_data_en + "\n\n"
            print_result += "피인용 횟수 : " + str(title_data_plus1) + "\n\n"
            print_result += "열람 횟수 : " + str(title_data_plus2) + "\n\n"
            print_result += "학술지 : " + journalInfo1 + "\n\n"
            print_result += "논문정보 : " + journalInfo2 + "\n\n"
            print_result += "발행기관 : " + journalInfo3 + "\n\n"
            
            print_result += "저자 정보\n"
            for x in range(len(name1)):
                print_result += str(x) + " : " + name1[x] + " (" + name2[x] + ")\n"
            print_result += "\n"

            print_result += "논문 초록\n"
            print_result += content1 + "\n\n"
            print_result += content2 + "\n\n"

            print_result += "키워드\n"
            if len(content3) == len(content4):
                for x in range(len(content3)):
                    print_result += str(x) + " : " + content3[x] + " (" + content4[x] + ")\n"
            else:
                for x in range(len(content3)):
                    print_result += str(x) + " : " + content3[x] + "\n"
            print_result += "\n"

            if len(reference) > 0:
                print_result += "참고 문헌\n"
                for x in range(len(reference)):
                    print_result += reference[x] + "\n"
                print_result += "\n"

        # Clean up the figure data.
        figure_image_name = []
        figure_image_src = []
        if len(figure_name) > 0:
            max_cnt = max(image_list)
            max_list = []
            count_list = []
            for x in range(max_cnt+1):
                max_list.append(image_list.count(x))
                count_list.append(0)

                if x >= 1:
                    max_list[x] += max_list[x-1]
            # print(max_list)
            # print(count_list)

            for x in range(len(figure_name)):
                if (count_list[figure_list[x] - 1] + max_list[figure_list[x] - 1]) < max_list[figure_list[x]]:
                    if image_name[(count_list[figure_list[x] - 1] + max_list[figure_list[x] - 1])].count('No Image') == 0:
                        figure_image_name.append(figure_name[x])
                        figure_image_src.append("images/" + image_name[(count_list[figure_list[x] - 1] + max_list[figure_list[x] - 1])])
                    count_list[figure_list[x] - 1] += 1

            if len(figure_image_name) > 0:
                print_result += "그림\n"
                for x in range(len(figure_image_name)):
                    print_result += figure_image_name[x] + " " + figure_image_src[x] + "\n"
                print_result += "\n"

        print_result += "논문 내용\n"
        # Correct the spelling.
        # print("맞춤법 교정 시작!")
        result = result.strip().split(".")
        final_result = ""
        translate_result = ""
        for y in range(len(result)):
            # print("맞춤법 교정 중.... " + str(round((y+1) / (len(result)+1) * 100, 2)) + "%")
            if len(result[y]) > 0:
                try:
                    temp = spell_checker.check(result[y] + '.')
                    final_result += temp.as_dict()['checked']
                    print_result += temp.as_dict()['checked'] + "\n"
                    if translator_cnt == 0:
                        if translator.translate(temp.as_dict()['checked']).src == 'ko':
                            translator_cnt = 1
                            translate_result += temp.as_dict()['checked']
                        elif translator.translate(temp.as_dict()['checked']).src == 'en':
                            translate_result += translator.translate(temp.as_dict()['checked'], dest='ko').text
                    else:
                        translate_result += temp.as_dict()['checked']
                except:
                    final_result += result[y] + ". "
                    print_result += result[y] + ".\n"
                    if translator_cnt == 0:
                        if translator.translate(result[y]).src == 'ko':
                            translator_cnt = 1
                            translate_result += result[y] + ". "
                        elif translator.translate(result[y]).src == 'en':
                            translate_result += translator.translate(result[y], dest='ko').text
                    else:
                        translate_result += result[y] + ". "
        # print("맞춤법 교정 완료!")
        # print("")

    if len(final_result) < 100:
        print("논문 내용이 뽑히지 않아 다시 진행중...")
        import tika
        tika.initVM()
        from tika import parser
        parsed = parser.from_file(tmp3)
        temp = parsed["content"]
        temp = temp.replace('\n', '')

        print_result += temp + "\n"
        final_result = temp
        result = temp.strip().split(".")
        final_result = ""
        translate_result = ""
        for y in range(len(result)):
            if translator.translate(result[y]).src == 'ko':
                translate_result = temp
                break

            # print("번역 중.... " + str(round((y+1) / (len(result)+1) * 100, 2)) + "%")
            translate_result += translator.translate(result[y], dest='ko').text + ". "
        # print("추출 완료!")

    # Use the summarization service
    # print("요약 서비스 시작!")
    summarize_data = lexlank_function(final_result)
    summarize_result = "본문 요약 (10줄)\n"
    for x in range(len(summarize_data)):
        try:
            if translator.translate(summarize_data[x]).src == 'en':
                summarize_result += "원문 : " + summarize_data[x] + "\n"
                summarize_result += "번역 : " + translator.translate(summarize_data[x], dest='ko').text + "\n\n"
            else:
                summarize_result += summarize_data[x] + "\n\n"
        except:
            summarize_result += summarize_data[x] + "\n\n"
    
    # print("요약 완료!")
    # print("")

    # print(translate_result)
    # print("키워드 추출 시작!")
    summarize_tags = keywords_function(translate_result)
    # print(summarize_tags)
    # visualize_function(PDFpathName, summarize_tags)
    # print("키워드 추출 완료!")
    # print("")
    output_name='res'+filename +'.txt'
    fileIn = open(settings.BASE_DIR / 'reports/algo/outputs'/ output_name, 'wt', encoding='utf-8')
    print(print_result, file=fileIn)
    fileIn.close()

    output1_name='final_'+filename +'.txt'
    fileOut = open(settings.BASE_DIR / 'reports/algo/outputs'/ output1_name, 'wt', encoding='utf-8')
    print(final_result, file=fileOut)
    fileOut.close()

    output2_name='summarize_'+filename +'.txt'
    fileOut = open(settings.BASE_DIR / 'reports/algo/outputs'/ output2_name, 'wt', encoding='utf-8')
    print(title_data+';^'+summarize_result, file=fileOut)
    fileOut.close()
    
    output3_name='tag_'+filename +'.txt'
    fileOut = open(settings.BASE_DIR / 'reports/algo/outputs'/ output3_name, 'wt', encoding='utf-8')
    print(summarize_tags, file=fileOut)
    fileOut.close()
    # fileOut = open('outputs/output2_' + filename +'.txt', 'wt', encoding='utf-8')
    # print(summarize_result, file=fileOut)
    # fileOut.close()

    # print(final_result,title_data,summarize_data,summarize_tags)
    # print( title_data,'#',summarize_data[0])
    print("프로그램 완료! 종료하겠습니다.")
    print("")
Example #31
 def __init__(self):
     tika.initVM()
     nltk.download('punkt')
Example #32
File: convtexto.py Project: senen2/juri
def pdf(filename):
    tika.initVM()
    file = parser.from_file(filename)
    texto = file['content']
    texto = texto.replace('\n\n', '¬').replace('\n', '').replace('¬', '\n\n')
    return texto
Example #33
File: Parser.py Project: pshg6899/red
    def get_xml(self, ocr, tikaUrl, path, image_save, save_path, fullname, pdf_save):
        tika.initVM()
        tika.TikaClientOnly = True
        os.environ['no_proxy'] = '*'

        name1 = fullname.split('/')
        name2 = name1[-1].replace('.pdf', '')
        name2 = name2 + "_ocr" + ".pdf"
        name3 = name1[-2] + '/' + name2
        parsed = parser.from_file(path, tikaUrl, xmlContent=True)
        xml2 = parsed["content"]
        xml2 = xml2.split('<div class="page">')
        convert_ocr_path = ""
        xml = ""

        if ocr == "true":

            #if not os.path.exists(save_path + name3):
            pages = convert_from_path(path, 450)
            image_counter = 1


            os.mkdir(image_save)
            os.mkdir(pdf_save)
            for page in pages:
                filename = "page_" + str(image_counter) + ".jpg"

                page.save(image_save+"/" + filename, 'JPEG')

                image_counter = image_counter + 1
            filelimit = image_counter - 1
            pageList = []
            for i in range(1, filelimit + 1):
                filename = "page_" + str(i) + ".jpg"
                filepdfname = "page_" + str(i) + ".pdf"

                pdf = pytesseract.image_to_pdf_or_hocr(image_save +"/" + filename, extension='pdf', config='-psm 6')
                with open(pdf_save+"/" + filepdfname, 'a+b') as f:
                    f.write(pdf)
            shutil.rmtree(image_save)

            merger = PdfFileMerger()
            path = pdf_save

            for i in range(1, filelimit + 1):
                filepdfname = "page_" + str(i) + ".pdf"
                merger.append(pdf_save +"/" + filepdfname)

            shutil.rmtree(pdf_save)

            merger.write(save_path + name3)
            merger.close()
            parsed = parser.from_file(save_path + name3, tikaUrl, xmlContent=True)
            convert_ocr_path=save_path + name3
            xml = parsed["content"]


        else:

            if xml2[2] == "<p />\n</div>\n":

                if not os.path.exists(save_path + name3):

                    pages = convert_from_path(path, 450)
                    image_counter = 1

                    os.mkdir(image_save)
                    os.mkdir(pdf_save)
                    for page in pages:
                        filename = "page_" + str(image_counter) + ".jpg"
                        page.save(image_save+'/' + filename, 'JPEG')
                        image_counter = image_counter + 1
                    filelimit = image_counter - 1
                    pageList = []
                    for i in range(1, filelimit + 1):
                        filename = "page_" + str(i) + ".jpg"
                        filepdfname = "page_" + str(i) + ".pdf"
                        pdf = pytesseract.image_to_pdf_or_hocr(image_save +'/'+ filename, extension='pdf', config='-psm 6' )
                        with open(pdf_save +"/" + filepdfname, 'a+b') as f:
                            f.write(pdf)
                    shutil.rmtree(image_save)

                    merger = PdfFileMerger()
                    path = pdf_save

                    for i in range(1, filelimit + 1):
                        filepdfname = "page_" + str(i) + ".pdf"
                        merger.append(pdf_save +"/" + filepdfname)

                    shutil.rmtree(pdf_save)

                    merger.write(save_path + name3)
                    merger.close()
                    parsed = parser.from_file(save_path + name3, tikaUrl, xmlContent=True)
                    convert_ocr_path = save_path + name3
                    xml = parsed["content"]
                else:
                    parsed = parser.from_file(save_path + name3, tikaUrl, xmlContent=True)
                    xml = parsed["content"]
            else:
                xml = parsed["content"]
        for i in rule['REPLACE']:
            xml = xml.replace(i[0], i[1])

        return xml, convert_ocr_path
Example #34
# Initialize the tika server
import tika
tika.initVM()

from tika import parser

print("텍스트 파일을 추출할 PDF파일명을 입력하세요.")
PDFfileName = 'documents/' + input() + '.pdf'

inputpath = PDFfileName
parsed = parser.from_file(PDFfileName)
temp = parsed["content"]

fileOut = open('output.txt', 'w', encoding='utf-8')
print(temp, file=fileOut)
fileOut.close()
Example #35
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 
# $Id$
#
# Author: mattmann
# Description: TBD

import json
import sys
import getopt
import tika
tika.initVM(tika.CLASSPATH)

_verbose = False
_helpMessage = '''
Usage: translate [-v] [-c column headers file] [-i input json file] [-j output json file] [-p cred file] [-f from] [-t to]

Options:
-i input json file --injson=file
    The input named JSON file.
-j json file --json=file
    Output the named JSON file.
-c column headers file --cols=file
    Use the provided column headers to parse the TSV and to name fields in the JSON.
-f from language --from=2 letter language code
    The 2 letter code of the language to translate from.
-t to language --to=2 letter language code
Example #36
File: doc2txt.py Project: wolfwhoami/xxxxx
 def init():
     tika.initVM()
Example #37
required packages:
    pip install python-docx
    pip install tika

@author: PAULO.GFERREIRA
"""
"""------ Pacotes ------"""
import unicodedata, re, os
from docx import Document
from datetime import datetime

# Tika is the package used to import documents in any format
import tika
# On Windows it is necessary to initialize the Java VM
if os.name == 'nt': tika.initVM()
from tika import parser
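# A hedged usage sketch for the two packages listed above (the file names are
# illustrative, not from the original script):
#   doc = Document("original.docx")                         # python-docx: structured access
#   texto = parser.from_file("revisado.docx")["content"]    # tika: plain-text extraction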
"""------ Classes ------"""


# Objects for the changes found
class Alteracoes_obj:
    #instancias_criadas = []
    def __init__(self,
                 ind_original=None,
                 ind_novo=None,
                 simi_difflib=None,
                 simi_bow=None,
                 tipo=None):
        self.ind_original = ind_original
        self.ind_novo = ind_novo
Example #38
 def __init__(self):
     tika.initVM()