Пример #1
0
def file_convert_to_txt_if_necessary(Prg, FileOrigAbsPath,
                                     Converted__FileBaseNames__OrigNames):
    """Convert FileOrigAbsPath to a sibling .txt file if its extension is
    convertable and the .txt file does not exist yet.

    Also records the original path in Converted__FileBaseNames__OrigNames,
    keyed by the base name without extension.
    """
    BaseNameNoExt, ExtensionLow = util.basename_without_extension__ext(
        FileOrigAbsPath, ExtensionLower=True)
    if ExtensionLow in ExtensionsConvertable:

        FilePathConvertedToText = util.filename_without_extension(
            FileOrigAbsPath) + ".txt"
        Converted__FileBaseNames__OrigNames[BaseNameNoExt] = FileOrigAbsPath

        if not os.path.isfile(
                FilePathConvertedToText):  # convert if it's necessary

            # Select exactly one converter with elif. Previously two
            # independent `if`s were used and `Converter` stayed unbound
            # (UnboundLocalError) for any convertable extension that is
            # neither .pdf nor .htm/.html.
            Converter = None
            if ExtensionLow == ".pdf":
                Converter = Prg["ConverterPdfToText"]
            elif ExtensionLow in (".htm", ".html"):
                Converter = Prg["ConverterHtmlToText"]

            if Converter is not None and Converter(
                    Prg, FileOrigAbsPath, FilePathConvertedToText):
                info("Successful conversion to txt: " + FileOrigAbsPath)
            else:
                # No converter registered for this extension, or the
                # conversion itself failed.
                ConversionErrorMsg = f"Error, file conversion: {FileOrigAbsPath}"
                util.log(Prg, ConversionErrorMsg)
                info(ConversionErrorMsg)
Пример #2
0
def extract_all_labels(filenames, out_filepath=DATA_FOLDER+'labels.p', chunk_size=2000):
    print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath)
    all_labels = []
    label_dict = {}

    filenames_chunks = util.chunks(filenames, chunk_size)

    for i, chunk in enumerate(filenames_chunks):
        pool = Pool(processes=util.CPU_COUNT)
        chunk_labels = pool.map(extract_labels, chunk)
        pool.close()

        for filepath, labels in zip(chunk, chunk_labels):
            if labels is not None:
                file_id = util.filename_without_extension(filepath)
                label_dict[file_id] = labels
                all_labels += labels

        print i+1, '/', len(filenames_chunks)

    #Write labels to file
    with open(out_filepath,'w') as f:
        pickle.dump(label_dict, f)

    print '\nLabels:'
    print len(set(all_labels))
    print Counter(all_labels)
Пример #3
0
def frog_process_files(files, verbose=True):
    """Run every file in `files` through the Frog NLP tool and write the raw
    output to OUTPUT_FOLDER as '<basename>.frog.out'.

    When `verbose`, prints per-file progress plus a running time estimate.
    """
    seen = []
    start_time = time.time()

    # Parser/MWU/NER/morphology/chunking disabled: only the basic analysis
    # output of process_raw is needed here.
    frogger = frog.Frog(
        frog.FrogOptions(parser=False, mwu=False, ner=False, morph=False,
                         chunking=False, numThreads=8),
        '/etc/frog/frog.cfg')

    for i, filename in enumerate(files):
        with open(filename, 'r') as in_file:
            output = frogger.process_raw(in_file.read())

        if verbose:
            print ('> PROCESSING', filename, str(len(seen))+'/'+str(len(files)))

        # Track progress unconditionally; previously this append lived inside
        # the `if verbose:` branch, tying bookkeeping to display output.
        seen.append(filename)

        if verbose:
            # Timings (estimation of time remaining)
            runtime = time.time() - start_time
            per_document_time = runtime / len(seen)
            remaining_time = (len(files) - len(seen)) * per_document_time
            total_time = remaining_time + runtime

            print ("RUNTIME", duration_to_string(runtime),
                   "("+duration_to_string(per_document_time)+")",
                   'REMAINING', duration_to_string(remaining_time),
                   'TOTAL', duration_to_string(total_time))

        frogged_filename = util.filename_without_extension(filename, '.txt')

        with open(OUTPUT_FOLDER+frogged_filename+'.frog.out', 'w') as f:
            f.write(output)
Пример #4
0
def extract_all_labels(filenames,
                       out_filepath=DATA_FOLDER + 'labels.p',
                       chunk_size=2000):
    print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath)
    all_labels = []
    label_dict = {}

    filenames_chunks = util.chunks(filenames, chunk_size)

    for i, chunk in enumerate(filenames_chunks):
        pool = Pool(processes=util.CPU_COUNT)
        chunk_labels = pool.map(extract_labels, chunk)
        pool.close()

        for filepath, labels in zip(chunk, chunk_labels):
            if labels is not None:
                file_id = util.filename_without_extension(filepath)
                label_dict[file_id] = labels
                all_labels += labels

        print i + 1, '/', len(filenames_chunks)

    #Write labels to file
    with open(out_filepath, 'w') as f:
        pickle.dump(label_dict, f)

    print '\nLabels:'
    print len(set(all_labels))
    print Counter(all_labels)
Пример #5
0
    def test_util_filename_extension__without_extension(self):
        """Check extension extraction and extension stripping in util."""
        if self._test_exec("test_filename_extension__without_extension"):
            # A dotted file name yields its extension, dot included.
            self.assertEqual(util.filename_extension("file.py"), ".py")

            # No dot at all -> empty extension.
            self.assertEqual(util.filename_extension("noext"), "")

            # Stripping the extension leaves the bare name.
            self.assertEqual(
                util.filename_without_extension("file.py"), "file")
Пример #6
0
    def test_collect_docs_from_working_dir(self):
        """A freshly written document file must appear in the collected docs."""
        if self._test_exec("test_collect_docs_from_working_dir"):
            Prg = self.Prg

            DocName = "test_file_document_example.txt"
            DocPath = os.path.join(Prg["DirDocuments"], DocName)

            util.file_del(DocPath)  # start from a clean slate
            util.file_write(Prg, Fname=DocPath, Content="example text")

            Collected = document.document_objects_collect_from_dir_documents(
                Prg)
            self.assertIn(util.filename_without_extension(DocName), Collected)

            util.file_del(DocPath)  # remove the fixture file again
Пример #7
0
def extract_plaintext(filepath, outpath):
    """Parse the XML court-decision file at `filepath` and write its textual
    content (summary, verdict, conclusion sections) as UTF-8 plain text to
    `outpath + file_id + '.txt'`.

    NOTE(review): Python 2 syntax (`print>>f`, `except KeyError, e`); the
    trailing bare `except:` is cut off in this excerpt — its handler body is
    not visible here.
    """
    with open(filepath) as fd:

        #Filename without extension
        file_id = util.filename_without_extension(filepath)
        plain_text = []  # accumulates output lines across all sections

        try:
            obj = xmltodict.parse(fd.read())
            root = obj['open-rechtspraak']
            metadata = root['rdf:RDF']  # NOTE(review): unused in this excerpt

            #############
            # Extract content as plain text
            #############

            # Each optional section is flattened into plain_text in order.
            if 'inhoudsindicatie' in root:
                summary = root['inhoudsindicatie']
                as_plain_text(summary, plain_text)

            if 'uitspraak' in root:
                content = root['uitspraak']
                as_plain_text(content, plain_text)

            if 'conclusie' in root:
                content = root['conclusie']
                as_plain_text(content, plain_text)

            #print filepath
            #Write to outfile
            with codecs.open(outpath+file_id+'.txt', 'w', 'utf-8') as f:
                 for line in plain_text:
                        print>>f, line

        except KeyError, e:
            #Skip silently -- files missing expected XML keys are ignored

            #print "ERROR:", sys.exc_info()[0]
            #print "Skipping faulty:", filepath
            #print "file_id:", file_id
            return
        except:
Пример #8
0
def extract_plaintext(filepath, outpath):
    """Parse the XML court-decision file at `filepath` and write its textual
    content (summary, verdict, conclusion sections) as UTF-8 plain text to
    `outpath + file_id + '.txt'`.

    NOTE(review): Python 2 syntax (`print >> f`, `except KeyError, e`); the
    trailing bare `except:` is cut off in this excerpt — its handler body is
    not visible here.
    """
    with open(filepath) as fd:

        #Filename without extension
        file_id = util.filename_without_extension(filepath)
        plain_text = []  # accumulates output lines across all sections

        try:
            obj = xmltodict.parse(fd.read())
            root = obj['open-rechtspraak']
            metadata = root['rdf:RDF']  # NOTE(review): unused in this excerpt

            #############
            # Extract content as plain text
            #############

            # Each optional section is flattened into plain_text in order.
            if 'inhoudsindicatie' in root:
                summary = root['inhoudsindicatie']
                as_plain_text(summary, plain_text)

            if 'uitspraak' in root:
                content = root['uitspraak']
                as_plain_text(content, plain_text)

            if 'conclusie' in root:
                content = root['conclusie']
                as_plain_text(content, plain_text)

            #print filepath
            #Write to outfile
            with codecs.open(outpath + file_id + '.txt', 'w', 'utf-8') as f:
                for line in plain_text:
                    print >> f, line

        except KeyError, e:
            #Skip silently -- files missing expected XML keys are ignored

            #print "ERROR:", sys.exc_info()[0]
            #print "Skipping faulty:", filepath
            #print "file_id:", file_id
            return
        except:
Пример #9
0
def extract_labels(filepath):
    """Parse the XML court-decision file at `filepath` and return its list of
    law-area labels (split on '; '), or None when expected keys are missing.

    NOTE(review): Python 2 syntax (`except KeyError, e`); the trailing bare
    `except:` is cut off in this excerpt — its handler body is not visible
    here.
    """
    with open(filepath) as fd:

        #Filename without extension
        file_id = util.filename_without_extension(filepath)

        try:
            obj = xmltodict.parse(fd.read())
            root = obj['open-rechtspraak']
            metadata = root['rdf:RDF']

            #############
            # Extract labels
            #############

            description = metadata['rdf:Description']

            # xmltodict yields a list when the element repeats; use the first.
            if type(description) is list:
                description = description[0]
            law_areas = description['dcterms:subject']

            # Normalise a single subject to a one-element list.
            if type(law_areas) is not list:
                law_areas = [law_areas]

            text_labels = []
            for x in law_areas:
                labels = x['#text'].split('; ')
                text_labels += labels

            return text_labels

        except KeyError, e:
            #Skip silently -- implicit `return None` for faulty files

            #print "ERROR:", sys.exc_info()[0]
            #print "Skipping faulty:", filepath
            #print "file_id:", file_id
            return
        except:
0
def extract_labels(filepath):
    """Parse the XML court-decision file at `filepath` and return its list of
    law-area labels (split on '; '), or None when expected keys are missing.

    NOTE(review): Python 2 syntax (`except KeyError, e`); the trailing bare
    `except:` is cut off in this excerpt — its handler body is not visible
    here.
    """
    with open(filepath) as fd:

        #Filename without extension
        file_id = util.filename_without_extension(filepath)

        try:
            obj = xmltodict.parse(fd.read())
            root = obj['open-rechtspraak']
            metadata = root['rdf:RDF']

            #############
            # Extract labels
            #############

            description = metadata['rdf:Description']

            # xmltodict yields a list when the element repeats; use the first.
            if type(description) is list:
                description = description[0]
            law_areas = description['dcterms:subject']

            # Normalise a single subject to a one-element list.
            if type(law_areas) is not list:
                law_areas = [law_areas]

            text_labels = []
            for x in law_areas:
                labels = x['#text'].split('; ')
                text_labels += labels

            return text_labels

        except KeyError, e:
            #Skip silently -- implicit `return None` for faulty files

            #print "ERROR:", sys.exc_info()[0]
            #print "Skipping faulty:", filepath
            #print "file_id:", file_id
            return
        except:
Пример #11
0
def filter_and_lemma(chunk_size=2000):
    files = glob.glob(INPUT_FOLDER+'*.frog.out')

    lemmatized = {}

    #Split all files in the list into chunks
    file_chunks = util.chunks(files, chunk_size)

    for i, chunk in enumerate(tqdm(file_chunks)):
        pool = Pool(processes=util.CPU_COUNT)
        filtered_lemmatized = pool.map(process, chunk)
        pool.close()

        for filename, value in zip(chunk, filtered_lemmatized):
            file_id = util.filename_without_extension(filename, '.frog.out')
            lemmatized[file_id] = value

    #Order by key
    ordered = OrderedDict(sorted(lemmatized.items()))

    with open(DATA_FOLDER+'processed.p','w') as f:
        pickle.dump(ordered,f)
    print "Done!"
Пример #12
0
def doc_objects_delete__file_abspath(Prg, FileAbsPathWithExt):
    """Delete the document object belonging to the given absolute file path,
    then delete the original file itself."""
    NameOnly = util.filename_without_extension(
        os.path.basename(FileAbsPathWithExt))
    doc_objects_delete__basename(Prg, NameOnly)
    # Delete the original file in every case, even if the DocObj doesn't exist.
    util.file_del(FileAbsPathWithExt)