Exemplo n.º 1
0
def load_corpus(directory):
    """Load the ``txt8`` and ``docx`` files found in *directory*.

    Every directory entry is announced on stdout before it is examined;
    entries with any other suffix are announced and then skipped.

    Args:
        directory: Path of the directory to scan. A trailing path
            separator is optional.

    Returns:
        A ``(texts, docs)`` tuple of dicts:

        * ``texts`` maps the name of each file ending in ``txt8``, minus
          its last character, to the file content decoded as ASCII with
          undecodable bytes dropped.
        * ``docs`` maps the full name of each file ending in ``docx`` to
          ``flatten(docx.getdocumenttext(...))`` of the cleaned document.
    """
    texts = {}
    docs = {}
    for f in os.listdir(directory):
        # Join instead of concatenating so that a directory given without
        # a trailing separator still yields a valid path.
        path = os.path.join(directory, f)
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function; the
        # two spaces reproduce the original "'Loading: ', path" output.
        print('Loading:  %s' % path)
        if f.endswith("txt8"):
            with codecs.open(path, 'r', 'ascii', 'ignore') as text:
                # Key is the file name without its final character.
                texts[f[:-1]] = text.read()
        elif f.endswith('docx'):
            d = docx.clean(docx.opendocx(path))
            # NOTE(review): flatten() is defined outside this block; the
            # original comment says this "converts to nltk text object" --
            # confirm against its definition.
            docs[f] = flatten(docx.getdocumenttext(d))
    return texts, docs
Exemplo n.º 2
0
                'color': 'auto',
                'space': 0,
                'sz': 6,
                'val': 'single',
            },
        },
        'celstyle': [
            {'align': 'center'},
            {'align': 'left'},
            {'align': 'right'},
        ],
        'headstyle': { 'fill':'C6D9F1', 'themeFill':None, 'themeFillTint':None },
    })
    
    # Cleaning
    docbody = docx.clean(docbody)

# ------------------------------
# Save output
# ------------------------------

# Prepare output file: a new deflate-compressed archive named 'out.docx'
# in the current working directory.
outfile = zipfile.ZipFile('out.docx', mode='w', compression=zipfile.ZIP_DEFLATED)

# Copy unmodified sections
# Member names taken from the first item of each actlist entry; built once
# rather than on every loop iteration.
# NOTE(review): assumes those first items are hashable (member-name
# strings) -- confirm against where actlist is built.
modified_names = set(entry[0] for entry in actlist)
for f in template.namelist():
    if f not in modified_names:
        # ZipFile.read() returns the member's bytes and manages the
        # underlying handle itself. The former template.open(f, 'rU')
        # relied on a universal-newlines mode that newer Python versions
        # reject, and left the handle open if reading or writing failed.
        # NOTE(review): presumes template is a zipfile.ZipFile -- confirm.
        data = template.read(f)
        outfile.writestr(f, data)