/
reqboxfixcrlf.py
107 lines (94 loc) · 3.57 KB
/
reqboxfixcrlf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
# Project: SIGA
# Component Name: reqboxfixcrlf
# Language: Python
#
# License: GNU Public License
# This file is part of the project.
# This is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# <http://www.gnu.org/licenses/>
#
# Author: Albert De La Fuente (www.albertdelafuente.com)
# E-Mail: http://www.google.com/recaptcha/mailhide/d?k=01eb_9W_IYJ4Pm_Y9ALRIPug==&c=L15IEH_kstH8WRWfqnRyeW4IDQuZPzNDRB0KCzMTbHQ=
#
# Description: This script it is a format fixing tool.
# MS files sometimes have '\r' character only instead of '\r\n'. This tool
# fixes those cases for properly parsing the file
#
# Limitations: Error handling is not correctly implemented, time constraints
# The code is not clean and elegant as it should, again, time constraints
# Database tables used: None
# Thread Safe: No
# Extendable: No
# Platform Dependencies: Linux (openSUSE used)
# Compiler Options:
"""
Fixes *\r* patterns from MS Word files, converting them to *\r\n*
Command Line Usage:
reqboxfixcrlf {<option> <argument>}
Options:
-i, --input Input file
-o, --output Output file
Examples:
"""
import logging, sys, mmap, shutil, contextlib, codecs, re
from vlog import vlogger
VERB_NON = 0
VERB_MIN = 1
VERB_MED = 2
VERB_MAX = 3
def utf8(s):
return s.encode('utf-8')
def fixcrlf(sourcefile, destinationfile):
vlog = vlogger(2, sys.stdout)
vlog(VERB_MIN, "Inputfile: %s" % sourcefile)
print("1")
sf = codecs.open(sourcefile, encoding='latin-1', mode='r') # open(filename, 'r')
#sf = open(sourcefile, 'r')
sm = mmap.mmap(sf.fileno(), 0, access=mmap.ACCESS_READ)
#df = codecs.open(destinationfile, encoding='utf-8', mode='wb') # open(filename, 'r')
df = open(destinationfile, 'wb')
#dm = mmap.mmap(df.fileno(), 0)#, access=mmap.ACCESS_DEFAULT)
loc = 0
end = sm.size()
terminator = b'\r\n'
linecounter = 0
totallines = 0
while loc < end:
fileline = sm.readline()
vlog(VERB_MED, "parsing line: %d" % linecounter)
# fileline = fileline.encode('utf-8')
loc = loc + len(fileline)
count = fileline.count(b'\r')
if count == 1:
print(type(fileline))
df.write(fileline)
totallines += 1
else:
vlog(VERB_MED, "FIXING multiline: %d lines on source line %d" % (count, linecounter))
fileline = fileline.replace(b'\r', b'\r\n')
df.write(fileline)
totallines += count
linecounter += 1
vlog(VERB_MIN, "Inputfile: %s" % sourcefile)
vlog(VERB_MIN, " * Total number of lines: %d" % linecounter)
vlog(VERB_MIN, "Outfile: %s" % destinationfile)
vlog(VERB_MIN, " * Total number of lines: %d" % totallines)
def fixdoublecrlf(sourcefile, destinationfile):
# HEX pattern to match: 0D0A0A0D0A
# Idea: just remove 0A0D0A
pass
def main(argv):
if len(argv) == 3:
fixcrlf(argv[1], argv[2] + ".tmp")
fixdoublecrlf(argv[2] + ".tmp", argv[2])
if __name__ == "__main__":
sys.exit(main(sys.argv))