Пример #1
0
def removesmallchar(basepath=None, min_chars=355):
    """Delete every file under a directory tree that contains too little text.

    Walks the tree recursively, reads each file as text and adds up the
    length of every line after stripping the ``os.linesep`` characters from
    both ends.  A file whose total is below ``min_chars`` is removed and its
    path is printed.

    Args:
        basepath: Root directory to scan.  When ``None`` (the default, which
            matches the original zero-argument call) the directory is
            obtained from ``bf.getdrt()``.
        min_chars: Files with fewer characters than this are removed.
            Defaults to 355, the threshold that used to be hard-coded.
    """
    if basepath is None:
        basepath = bf.getdrt()

    for root, _dirs, files in os.walk(basepath):
        for name in files:
            path = os.path.join(root, name)
            # The context manager closes the handle even when reading fails,
            # and guarantees it is closed before os.remove() runs.
            with open(path, 'r') as infile:
                characters = sum(len(line.strip(os.linesep)) for line in infile)
            if characters < min_chars:
                os.remove(path)
                print('removed file: ' + path)
Пример #2
0
import sys
import os
sys.path.insert(1,os.path.join(os.getcwd(),'dependencies')) 
import buildfolder as bf
import shutil
# from shutil import copyfile

# Root folder to scan; change it here if applicable.
basepath = bf.getdrt()
# Second folder returned by bf.getdrt(), with a trailing backslash appended.
copypath = bf.getdrt() + '\\'

# File extensions that must be skipped.
filterout = [
    'aiff', 'arc', 'asc', 'avi', 'bwf', 'csi', 'dbf', 'ddf', 'dht', 'dng',
    'dpx', 'dqt', 'e00', 'ebcdic', 'flac', 'gdb', 'gml', 'ics', 'jfif', 'kml',
    'mbox', 'mov', 'mp3', 'mpeg2', 'mpeg4', 'mxf', 'prc', 'pst', 'shp', 'shx',
    'step', 'u3d', 'utf16', 'utf8', 'warc', 'wave', 'wmv', 'x3d', 'x3dv',
]

# Full path of every file found below basepath, walked bottom-up.
qq = []
for root, dirs, files in os.walk(basepath, topdown=False):
    for file in files:
        qq.append(os.path.join(root, file))

for filename in qq:
    # Last path component; splitting on '\\' only works for backslash-separated paths.
    file = filename.split('\\')[-1]
    # Text after the final '.' in the full path, used as the file extension.
    fileext = filename.split(".")[-1]
    # For ezEmail - email content, if file is in the pdf folder extract record ID from end of filename
    # NOTE(review): 'pdf' and 'attachment' are matched against the whole path, so any
    # folder or file name containing those substrings affects the result - confirm intended.
    if 'pdf' in filename and 'attachment' not in filename and fileext.lower() not in filterout:
Пример #3
0
from rake_nltk import Rake
from bs4 import BeautifulSoup
import sys
import os
sys.path.insert(1, os.path.join(os.getcwd(), 'dependencies'))
import buildfolder as bf
import datetime
import xlsxwriter
import xlrd
import csv

# Select the location where keywords.xlsx exists; the resulting spreadsheet
# is written to the same folder.
rootdir = bf.getdrt()
now = datetime.datetime.now()

# Delete spreadsheets left over from earlier runs.
for fname in os.listdir(rootdir):
    if fname.startswith("Keyword Spreadsheet"):
        os.remove(os.path.join(rootdir, fname))
        print('Existing Spreadsheet Deleted')

# Build the dated output path with os.path.join (as the delete loop above
# does) instead of gluing the pieces together with a hard-coded '//'.
workbook = xlsxwriter.Workbook(
    os.path.join(rootdir,
                 'Keyword Spreadsheet' + '_' + now.strftime('%m-%d-%y') + '.xlsx'))
worksheet = workbook.add_worksheet("Sheet 1")

# Rows and columns are zero indexed; writing starts at row index 1
# (presumably row 0 is reserved for a header - verify against the code below).
row1 = 1
col = 0

#get list of files
Пример #4
0
import string
import time
import sys
import os
sys.path.insert(1, os.path.join(os.getcwd(), 'dependencies'))
import buildfolder as bf
import shutil
import random
import re

# Optional redirection of stdout to log.txt; currently disabled
# (original note: "COMMENT THIS OUT TOO").
#stdoutOrigin=sys.stdout
#sys.stdout = open("log.txt", "w")

# bf.getdrt() is called once per path. finalpath gets a trailing backslash
# because later code appends a folder name to it by plain concatenation.
sourcepath = bf.getdrt()
finalpath = bf.getdrt() + '\\'


def remove_punc(str):
    """Return the text with punctuation removed and digit runs turned into a space.

    Punctuation is stripped before digits are handled, so ``"1,000"`` first
    becomes ``"1000"`` and is then replaced by a single space.

    Args:
        str: Text to clean.  The name shadows the builtin but is kept so
            existing callers (including keyword callers) keep working.

    Returns:
        The cleaned text.
    """
    # Use string.punctuation explicitly: the visible imports only bring in the
    # ``string`` module, so a bare ``punctuation`` name would raise NameError.
    without_punctuation = ''.join(c for c in str if c not in string.punctuation)
    # Raw string so that "\d" is a regex digit class, not an invalid string escape.
    return re.sub(r"\d+", " ", without_punctuation)


def removesmallchar(sourcepath, finalpath=finalpath):
    """Start by making sure the ``smallfiles`` destination folder exists.

    Args:
        sourcepath: Not used by the setup steps below; presumably the tree
            to scan - verify against the rest of the function.
        finalpath: Prefix the ``smallfiles`` folder name is appended to.
            The default is the module-level ``finalpath`` value captured
            when this function is defined.
    """
    # These self-assignments have no effect.
    sourcepath = sourcepath
    finalpath = finalpath
    directory = "smallfiles"
    # os.path.join() receives a single, already concatenated string, so no
    # separator is inserted here: finalpath must already end with one.
    destpath = os.path.join(finalpath + directory)

    # Create the destination folder on first use and report it.
    if not os.path.exists(destpath):
        print('made folder ' + destpath)
        os.mkdir(destpath)
Пример #5
0

if __name__ == "__main__":

    # Planned flow (original outline):
    #get source directory
    #get target directory
    #open the logfile
    #build q

    #iterate through q
    #check for similar file name in folder
    #tika extract
    #
    #save directly to target directory

    # Source and target directories are obtained from bf.getdrt().
    source = bf.getdrt('source')
    target = bf.getdrt('target')

    # Queue built from the source directory by bf.buildq().
    source_q = bf.buildq(source)
    # Log file inside the target directory; 'w+' truncates an existing log,
    # and characters that cannot be encoded as UTF-8 are dropped.
    log = open(os.path.join(target, 'logfile.txt'),
               'w+',
               encoding="utf8",
               errors='ignore')

    def process(q):
        # Walks the queue from a hard-coded start offset. Note that p restarts
        # at 0 for the first item of the slice, not at the offset itself.

        for p, i in enumerate(
                q[38975:]
        ):  # set the start offset here: enumerate(q[start:]), e.g. enumerate(q[5:])
            content = ''
            # Last '/'-separated component of the queue entry.
            filename = i.split('/')[-1]
Пример #6
0
import os
import sys
import zipfile

# Make the 'dependencies' folder next to the parent of the current working
# directory importable.  The parent is found by dropping the last
# backslash-separated component of os.getcwd().  The backslashes must be
# escaped ('\\'), and os has to be imported before this line runs.
sys.path.insert(1, os.path.join('\\'.join(os.getcwd().split('\\')[:-1]), 'dependencies'))
import cxwalk
import buildfolder

# The old schedule id must be contained within the name of the zip file.
# Unzip each file and save it to the folder of the schedule id contained in its name.
if __name__ == "__main__":
    print('Starting Script')
    # NOTE(review): this list is replaced by the fail-log file handle further down.
    fail = []

    # Get the source/target directories.
    source = buildfolder.getdrt('source')
    target = buildfolder.getdrt('target')
    
    # Build the folder queue from the source directory.
    source_q = buildfolder.buildq(source)
    
    # For each item in the queue, translate the item, then make the folder.
    count = 0
    print(len(source_q))
    # Fail log in the target directory; 'w+' truncates an existing log.
    fail = open(os.path.join(target,'faillog.txt'),'w+')
    
    for item in source_q: # loop through items in q
        # count advances for every queue entry, including the skipped non-zip ones.
        count += 1
        if not item.endswith(".zip"):
            continue
        try:
            zipObj = zipfile.ZipFile(item, 'r')
            print(item)
Пример #7
0
### Track the count
##

import os
import sys

sys.path.insert(1,os.path.join('\\'.join(os.getcwd().split('\\')[:-1]),'dependencies'))

import dctmdl as dd
import buildfolder as bf

# Destination directory, obtained from bf.getdrt(); a fixed path can be used instead:
#destination = r'C:\Users\mnguyen\Desktop\test'
destination = bf.getdrt()
# CSV file that is read below. The bare file name is resolved against the
# current working directory; an absolute path can be used instead:
#sourcelist = r'C:\Users\mnguyen\Environmental Protection Agency (EPA)\ECMS - Documents\github\Document_Processing_Scripts\Dwl Obj by ID from Schedule\objid.csv'
sourcelist = 'objid.csv'

import csv
import logging

# Module-level logger; no handler or log file is configured here.
logger = logging.getLogger(__name__)
#except:
#    logging.basicConfig(filename='download')

# A context-manager variant is kept below for reference; it is disabled.
#with open(sourcelist, newline='') as csvfile:
    #reader = csv.reader(csvfile, delimiter=',')

# The CSV is opened without a context manager, so csvfile stays open until it
# is closed explicitly. newline='' is what the csv module expects for files
# passed to csv.reader.
csvfile = open(sourcelist, newline='')
reader = csv.reader(csvfile, delimiter=',')
count = 0

# The value is masked in this copy of the script.
username = '******'