Пример #1
0
def _convert_dataset(split_name, filenames, dataset_dir):
    """Convert a list of captcha images into one TFRecord file.

    Args:
        split_name: either 'train' or 'test'; used as the TFRecord file stem.
        filenames: iterable of image paths; the first four characters of each
            base name are the four digit labels.
        dataset_dir: unused here; kept for interface compatibility.
    """
    assert split_name in ['train', 'test']

    # NOTE(review): the session object is never used; kept so graph/session
    # lifetime stays exactly as before.
    with tf.Session() as sess:
        output_filename = os.path.join(TFRECORD_DIR, split_name + '.tfrecords')
        with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer:
            for i, filename in enumerate(filenames):
                try:
                    sys.stdout.write('\r>> Converting image %d/%d' %
                                     (i + 1, len(filenames)))
                    sys.stdout.flush()

                    # Load, resize to 224x224, grayscale, raw bytes.
                    image_data = Image.open(filename)
                    image_data = image_data.resize((224, 224))
                    image_data = np.array(image_data.convert('L'))
                    image_data = image_data.tobytes()

                    # First four characters of the base name are the labels.
                    # os.path.basename is portable; the original '/'-split
                    # broke on Windows-style paths.
                    labels = os.path.basename(filename)[0:4]
                    num_labels = [int(c) for c in labels]
                    example = image_to_tfexample(image_data, num_labels[0],
                                                 num_labels[1], num_labels[2],
                                                 num_labels[3])
                    tfrecord_writer.write(example.SerializeToString())

                except IOError as e:
                    print('Could not read:', filename)
                    print('Error:', e)
                    print('Skip it\n')

    sys.stdout.write('\n')
    sys.stdout.flush()
    def __prepareData(self, pathsToFold, experiment, prot_attr_name):
        '''
        Reads training scores and predictions from disk and arranges them into dataframes.

        Returns a tuple (predictedScores, groundtruth):
          * predictedScores -- dict mapping the delta parsed from each result
            file name ('...=<delta>.txt') to the dataframe read from it.
          * groundtruth -- ground-truth dataframe sorted by descending score,
            with a 1-based 'doc_id' column appended.

        The original kept an unused 'pred_files' list and duplicated the
        read/invert/sort logic across three dataset branches; both fixed.
        '''
        # Collect per-delta prediction files written by the training step.
        predictedScores = {}
        for filename in os.listdir(self.__trainingDir):
            if 'GermanCredit_' + prot_attr_name + '_ALG' in filename:
                delta = float((filename.split('=')[1]).split('.txt')[0])
                predictedScores[delta] = pd.read_csv(
                    self.__trainingDir + '/' + filename, sep=",", header=0)

        # Per-dataset ground-truth file, score column, and whether scores
        # must be inverted (1 - score).
        if 'german' in self.__dataset:
            gt_file, score_col, invert = 'GermanCredit_' + prot_attr_name + '.csv', 'score', self.rev
        elif 'biased_normal' in self.__dataset:
            gt_file, score_col, invert = 'BiasedNormalSynthetic_' + prot_attr_name + '.csv', 'score', self.rev
        elif 'compas' in self.__dataset:
            # COMPAS raw scores are flipped in the NON-reversed case.
            gt_file, score_col, invert = 'ProPublica_' + prot_attr_name + '.csv', 'Recidivism_rawscore', not self.rev

        groundtruth = pd.read_csv(self.__trainingDir + '/' + gt_file, sep=",", header=0)
        if invert:
            groundtruth[score_col] = groundtruth[score_col].apply(lambda val: 1 - val)
        groundtruth = groundtruth.sort_values(by=[score_col], ascending=False).reset_index(drop=True)

        groundtruth['doc_id'] = np.arange(len(groundtruth)) + 1
        return predictedScores, groundtruth
Пример #3
0
 def retrieve_htmlpage_identifier(self, filename):
     """Extract numeric path segments that identify an HTML page.

     Strips the scheme from *filename* (a URL), then collects every
     '/'-separated segment that contains a digit but no dot.

     Returns a list of the matching segments (may be empty).
     """
     # The original shadowed the builtin 'str'; use a descriptive name.
     identifiers = []
     filename = filename.replace("http://", "").replace("https://", "")
     for part in filename.split("/"):
         # Skip anything that looks like a host or file name (contains '.').
         if "." not in part and utils.contains_digits(part):
             identifiers.append(part)
     return identifiers
Пример #4
0
def imageUpload():
    """Handle an uploaded image from a Flask request.

    Saves the file under a uniquified name when its extension is allowed.
    Returns the stored filename, a redirect on a bad extension, or 0 when
    no file was posted.
    """
    if request.files['file']:
        file = request.files['file']
        # Use the text after the LAST dot so multi-dot names work.
        ext = file.filename.split(".")[-1]
        if ext.lower() in ALLOWED_EXTENSIONS:
            filename = secure_filename(file.filename)
            # rsplit keeps the full stem for names like 'a.b.jpg'
            # (the original split('.')[0] dropped everything after the first
            # dot and split('.')[1] picked the wrong extension).
            stem, extension = filename.rsplit('.', 1)
            file.filename = stem + '_' + str(uuid.uuid4()) + '.' + extension
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], file.filename))
            return file.filename
        else:
            print(ext.lower())
            return redirect(request.url)
    else:
        return 0
Пример #5
0
def imageUpload():
    """Handle an uploaded image for the trash-reporter form.

    Saves the file under a uniquified name when its extension is allowed;
    otherwise flashes an error, rolls back the open DB transaction and
    re-renders the registration form.
    """
    # Bail out early when the post request has no file part at all.
    if 'file' not in request.files:
        # The original assigned to an undefined 'file' here (NameError);
        # just return the empty name.
        return ''

    file = request.files['file']

    # Check the empty-filename case BEFORE touching the extension (the
    # original indexed split(".")[1] first and crashed on '' or dot-less
    # names with IndexError).
    if file.filename is None or file.filename == '':
        file.filename = ''
        return file.filename

    # Everything after the last dot; [-1] is safe for dot-less names too.
    ext = file.filename.split(".")[-1]

    if ext in ALLOWED_EXTENSIONS:
        filename = secure_filename(file.filename)
        # rsplit keeps multi-dot stems intact ('a.b.jpg' -> 'a.b' + 'jpg').
        stem, extension = filename.rsplit('.', 1)
        file.filename = stem + '_' + str(uuid.uuid4()) + '.' + extension
        file.save(os.path.join(app.config['UPLOAD_FOLDER'], file.filename))
        return file.filename

    # Disallowed extension: roll back the pending DB transaction and show
    # the form again with a single error message (the original flashed twice
    # and misspelled "upload").
    form = addPointForm(request.form)
    d_conn = pg_operations2.pg_connect2(database, user, password, host,
                                        port)
    conn = d_conn['conn']
    conn.rollback()
    d_conn = pg_operations2.pg_disconnect2(d_conn)

    flash('ERROR! Please upload only jpeg, png, gif or tif.', 'danger')
    # NOTE(review): the double dot in the template name looks like a typo --
    # confirm the actual template file name before changing it.
    return render_template('registerTrashReporter..html', form=form)
Пример #6
0
def getCaptionRowAndPosition(filepath):
    """Parse a photo file name into (caption, row, position).

    File names may start with "<row>" or "<row>-<position>" followed by a
    space and the caption, e.g. "10-01 Roses.jpg" -> (" Roses    ", 10, 1).
    Row and position default to -1 when absent.  The caption is padded with
    a leading space and trailing spaces to work around an oversize font.
    """
    # Get the filename part, without the extension.
    filename, extension = os.path.splitext(os.path.split(filepath)[1])

    row = -1       # No row specified
    position = -1  # No position
    caption = ""
    if filename[0].isdigit():
        # Leading token, e.g. "10-01" or "10".
        row_pos = filename.split()[0]

        # The row is always the first dash-separated part.
        parts = row_pos.split("-")
        row = int(parts[0])

        # The position is the second part, if it exists.
        if len(parts) > 1:
            position = int(parts[1])

        # Caption is whatever follows the first space; a bare "10.jpg"
        # has none (the original crashed with IndexError here).
        pieces = filename.split(" ", 1)
        caption = pieces[1] if len(pieces) > 1 else ""
    else:
        caption = filename

    # Hacks to work around the oversize font:
    # 1. Prepend a space.
    caption = " " + caption
    # 2. If the trailing characters are NOT digits, append up to 4 spaces;
    #    the closer the last non-digit is to the end, the more padding.
    append = " "
    for i in range(-3, 0):
        # Guard against captions shorter than 3 characters (a negative
        # index past the start would raise IndexError).
        if i >= -len(caption) and not caption[i].isdigit():
            append = " " * (i + 5)
    caption = caption + append

    return caption, row, position
Пример #7
0
 def extractPageNumberFromFilename(self, filename):
     """Return the page number encoded in *filename*, or 0 when absent.

     Expects names like 'magazine_page12.txt'; the number is taken from
     the last '_'-separated token, or from the first token containing
     'page' when the last one does not.
     """
     if not utils.contains_digits(filename):
         return 0

     filePrefix = filename.split(".")[0]
     pageString = filePrefix.split("_")[-1]
     if 'page' not in pageString:
         # The original iterated over the CHARACTERS of filePrefix here,
         # so this fallback never fired; scan the '_' tokens instead.
         for part in filePrefix.split("_"):
             if 'page' in part:
                 pageString = part

     pageStr = ''.join(ch for ch in pageString if ch.isdigit())
     if not pageStr:
         # Digits were only in the extension; no page number available
         # (the original crashed with ValueError on int('')).
         return 0
     return int(pageStr)
Пример #8
0
 def move_to_another(self, filename):
     """Move a file out of '/General Purpose/' into its entity directory.

     *filename* carries the entity type as its second '|'-separated field.
     Local-government entity types are left in place.  Errors are printed
     (best effort, never raises).
     """
     try:
         entity_type = filename.split('|')[1]
         remote_filename = self._get_remote_filename(filename)
         if not remote_filename:
             return
         # Membership test replaces the original chained equality checks.
         if entity_type in ('County', 'City', 'Township', 'Village'):
             return
         directory, server_filename = remote_filename
         self.ftp.rename('/General Purpose/{}'.format(server_filename),
                         '/{}/{}'.format(directory, server_filename))
         print('Moved {} to {}'.format(server_filename, directory))
     except Exception as e:
         # Deliberate best-effort: report and continue.
         print(str(e))
 def filter_already_present(self, json_filenames: List[str],
                            spider: str) -> List[str]:
     """Drop files whose stem already exists in the database for *spider*.

     A json filename matches a stored file when its base name without
     extension equals a stored ``file_name``.
     """
     # Retrieve the already processed files from the database.
     table_string = f"file {join_decision_and_language_on_parameter('file_id', 'file.file_id')}"
     where_string = f"file.file_id IN {where_string_spider('file_id', spider)}"
     all_filenames_of_spider = self.select(self.get_engine(self.db_scrc),
                                           table_string, "file_name",
                                           where_string)
     # Collect every stored name into one set so each candidate is checked
     # in O(1); the original re-filtered the whole list once per chunk.
     present = set()
     for filename_chunk in all_filenames_of_spider:
         present.update(filename_chunk['file_name'])
     return [
         filename for filename in json_filenames
         if filename.split('/')[-1].split('.')[0] not in present
     ]
Пример #10
0
 def extractVolumeIssueFromFilename(self, filename):
     """Parse (name, volume, issue) out of a magazine file name.

     Expects at least four '_'-separated tokens before the first dot;
     returns 0 when the name has no digits or too few tokens.  Dashes
     are stripped from the issue.
     """
     name = volume = issue = ""
     if not utils.contains_digits(filename):
         return 0
     filePrefix = filename.split(".")[0]
     # 'file' shadowed a builtin in the original; renamed.
     parts = filePrefix.split("_")

     count = len(parts)
     if count < 4:
         return 0

     name = parts[1] + " " + parts[2]
     volume = parts[count - 2]
     # Guard the parts[4] access: with exactly four tokens the original
     # raised IndexError here.
     if count > 4 and volume != parts[4]:
         issue = "" + parts[3] + "_" + parts[4]
     else:
         issue = "" + parts[3]

     issue = issue.replace("-", "")

     return name, volume, issue
def readSinglePatterns(root, resultListBox):
    """Load every single-pattern spreadsheet into singlePatternDict.

    For each accepted file, row 0 provides the keys and row 1 the values
    of the pattern dict, stored under the file name without extension.
    Progress is reported into resultListBox; returns 1 on success and 0
    as soon as any file fails to load.
    """
    for filename in os.listdir(singlePatternPath):
        if not checkFileName(filename):
            continue
        try:
            workbook = xlrd.open_workbook(singlePatternPath + filename)
            sheet = workbook.sheets()[0]
            header = sheet.row_values(0)
            values = sheet.row_values(1)
            pattern = {}
            for key, value in zip(header, values):
                pattern[key] = value
            patternKey = filename.split('.', 1)[0]
            singlePatternDict[patternKey] = pattern
            resultListBox.insert(END, 'Successfully read pattern ' + filename + '!')
        except Exception as e:
            # Report the failure and abort the whole scan.
            MessageBoxPromt(0, 'Exception ' + str(e) + '\n  ERROR 1 in reading pattern file', root)
            resultListBox.insert(END, 'Failed to  read pattern ' + filename + '! Exception: ' + str(e))
            print(e)
            return 0
    return 1
Пример #12
0
 def execute_with_params(self, file_pattern="", path="", magazines=None,articles=None):
     """Walk *path*, read matching documents and attach their text to magazines.

     In every folder under *path*, a file named *file_pattern* is opened for
     writing (truncating it).  Files ending in *file_pattern* are read and
     their text is appended to the matching magazine's article, split into
     <=3000-character chunks; .xml/.html files are parsed and split as well.

     Returns (result, listOfMagazines): the distinct magazine objects that
     received content, and their ids.

     NOTE(review) -- apparent defects deliberately left untouched here:
       * 'values' (small-doc check), 'html' (xml branch) and 'result'
         (xml/html branches) are read before assignment -> NameError when
         those paths actually run;
       * 'page' may be unbound when the first file hit is .xml/.html;
       * 'container', 'docValue', 'prevId', 'identifier' and the 'articles'
         parameter are never used;
       * 'result' is reset to [] after the walk, so any articles appended
         in the .xml/.html branches would be discarded anyway.
     """
     listOfMagazines=[]
     container = dict()
     for folder, subs, files in os.walk(path):
         # 'w' truncates any existing aggregate file in this folder.
         with open(os.path.join(folder, file_pattern), 'w') as dest:
             docValue=""
             prevId=""
             identifier=""
             page_count=0
             for filename in files:
                 #print("Filename "+filename)
                 if filename == file_pattern:
                     # Skip the aggregate file itself.
                     pass
                 elif filename.endswith(file_pattern):
                     document=""
                     if not folder in path:
                         document = folder+"/"+filename
                     else:
                         document = path+filename
                     doc = self.read(document)
                     if magazines==None:
                         pass
                         #container = self.saveValuesForNewIdentifier(container, identifier, doc)
                     elif len(doc)<1:
                         pass
                     else:
                         fileId = self.getMagazine(filename)
                         mag=magazines[fileId]
                         if fileId not in listOfMagazines:
                             listOfMagazines.append(fileId)
                         page = self.extractPageNumberFromFilename(filename)
                         if page > 0:
                             article = mag.find_article_by_page(page)
                             logger.debug(article)
                             logger.debug(filename)

                             # Magazine name = part of the name before "page".
                             name=""
                             str_name = filename.split("page")
                             try:
                                 name = str_name[0]
                             except:
                                 name = filename
                             mag.set_name(name)
                             if article != None: #If article exists
                                 # NOTE(review): 'values' is undefined here --
                                 # this raises NameError whenever len(doc) < 5.
                                 if len(doc)<5 and len(values[-1:])<3900:
                                     logger.info("SMALL "+str(len(doc)) + " = "+doc)
                                     pass
                                 if len(doc)>0:
                                     self.setLengths(len(doc))
                                     #split value in case too long to process
                                     if len(doc)>3000:
                                         sentences=""
                                         splitted = doc.split(' ')
                                         for split in splitted:
                                             lenn = len(sentences) + len(split) + 1 #+1 for the space
                                             if lenn > 3000:
                                                 article.addText(sentences, page)
                                                 sentences = ""
                                             sentences += split+" "
                                         if len(sentences) > 0:
                                             article.addText(sentences, page)
                                     else:
                                         article.addText(doc, page)
                                         article.set_len(len(doc))
                             else:
                                 #in case we cannot find article
                                 article = Article(filename, page, "")
                                 article.addText(doc, page)
                                 article.set_len(len(doc))
                                 self.setLengths(len(doc))
                                 mag.add_article(article)
                         else:
                             #if article not found for the document, store pages page by page
                             print("stored file "+filename+" as article was not found")
                             article = Article(filename, page, "")
                             article.addText(doc, page)
                             article.set_len(len(doc))
                             mag.add_article(article)
                             self.setLengths(len(doc))

                         magazines[fileId] = mag
                         #mag.log_articles_and_contents()

                 elif filename.endswith(".xml"):
                     page_count = 1+page_count
                     article = Article(filename, page, "")
                     doc, document = self.readDocument(path, folder, filename, document)
                     if len(doc)>0:
                         xml = xmlParser(input_file=doc)
                         # NOTE(review): 'html' is used before assignment and
                         # 'result' does not exist yet -- this path raises.
                         if bool(BeautifulSoup(html, "html.parser").find()) == True:
                             self.setLengths(len(doc))
                             article.set_len(len(doc))
                             html = htmlParser(doc)
                             article = self.split_document(html.get_text(), article, page_count)
                             result.append(article)
                     else:
                         #process
                         pass
                 elif filename.endswith(".html"):
                     page_count = 1+page_count
                     article = Article(filename, page, "")

                     doc, document = self.readDocument(path, folder, filename, document)
                     if len(doc)>0:
                         self.setLengths(len(doc))
                         article.set_len(len(doc))
                         html = htmlParser(doc)
                         article = self.split_document(html.get_text(), article, page_count)
                         # NOTE(review): 'result' is not defined until after
                         # the walk below -- this append raises NameError.
                         result.append(article)
                     else:
                         #process
                         pass
             dest.close()

     # Deduplicated list of magazines that actually received content.
     result = []
     logger.debug("VALUES FOR magazines "+str(len(listOfMagazines)))
     for id in listOfMagazines:
         if magazines[id] not in result:
             result.append(magazines[id])
     return result, listOfMagazines
Пример #13
0
def ArchiveFileS3(bucketname, filename, logfile, **s3params):
    """Archive an S3 file into '<bucketname>archive' with a date suffix.

    Copies *filename* from *bucketname* to the archive bucket as
    ``<stem>_<ddmmyyyy>.<ext>`` (date taken from the data's 'date' column
    when present, otherwise today) and deletes the original on success.
    Exits the process when mandatory arguments are missing.
    """
    #-----------------------------------------------------------
    #logging
    #-----------------------------------------------------------
    if logfile == '':
        msg = "logfile is mandatory, exiting"
        print(msg)
        sys.exit(1)

    currenttime = datetime.now().strftime('%d%m%Y_%H%M%S')
    logfileobj = open(logfile, "a")
    try:
        msg = "ArchiveFileS3 process started"
        logfileobj.write("\n{}: {}".format(currenttime, msg))
        print(msg)

        #-----------------------------------------------------------
        #input parameters check and variable declaration
        #-----------------------------------------------------------
        msg = ''
        if len(s3params) == 0:
            msg = "s3 params are not provided, exiting"
        elif bucketname == '':
            msg = "sourcebucket is mandatory, exiting"
        elif filename == '':
            # The original message said "targetbucket" here by mistake.
            msg = "filename is mandatory, exiting"

        if len(msg) != 0:
            currenttime = datetime.now().strftime('%d%m%Y_%H%M%S')
            logfileobj.write("\n{}: {}".format(currenttime, msg))
            print(msg)
            sys.exit(1)
    finally:
        # The original leaked this handle; it is not used past this point.
        logfileobj.close()

    df = GetFileDatas3(bucketname, filename, logfile, **s3params)

    # Date suffix: newest 'date' in the data when available, else today.
    if 'date' in df.columns:
        datevalue = df['date'].max()
        datevalue = datetime.strftime(datetime.strptime(datevalue, '%d/%m/%Y'),
                                      '%d%m%Y')
    else:
        datevalue = datetime.now().strftime('%d%m%Y')

    archivebucket = bucketname + 'archive'
    # rsplit keeps multi-dot stems intact ('a.b.csv' -> 'a.b' + 'csv');
    # the original split('.') mangled such names.
    archivefilename, archivefilextension = filename.rsplit('.', 1)
    archivefile = archivefilename + '_' + datevalue + '.' + archivefilextension

    #-----------------------------------------------------------
    #archive bucket creation
    #-----------------------------------------------------------
    CreateBucketS3(archivebucket, logfile, **s3params)
    time.sleep(3)

    #-----------------------------------------------------------
    #file copy, then delete the source on success
    #-----------------------------------------------------------
    tobedeleted = CopyFileS3(bucketname, archivebucket, filename, archivefile,
                             logfile, **s3params)

    if tobedeleted == 1:
        DeleteFileS3(bucketname, filename, logfile, **s3params)
Пример #14
0
        and c in all_letters)
    
    
print(unicodeToAscii('Ślusàrski'))

# Build category_lines: a dictionary mapping each language (category) to
# its list of names; all_categories collects the category keys in order.
category_lines = {}
all_categories = []
# Read a file and split into lines
def readLines(filename):
    """Read *filename* (UTF-8) and return its lines, ASCII-normalised.

    The original leaked the file handle; 'with' closes it deterministically.
    """
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    # Category = file stem.  NOTE(review): split('/') assumes POSIX
    # separators, yet the lookup below uses a backslash key
    # ('names\\Korean'), which only exists when findFiles returns
    # Windows-style paths -- confirm on the target platform.
    category = filename.split('/')[-1].split('.')[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

# Number of distinct language categories found.
n_categories = len(all_categories)

print(category_lines['names\\Korean'][:5])


import torch

# Find the index of a letter in all_letters, e.g. "a" -> 0
def letterToIndex(letter):
    """Return the position of *letter* within all_letters, or -1 if absent."""
    return all_letters.index(letter) if letter in all_letters else -1