def repairIdInCsv(fileName):
    '''Left-pad the first column of a CSV file with zeros to 7 characters.

    The first row is treated as a header and copied unchanged.  Empty rows
    are dropped.  For every other row the first field is padded on the left
    with "0" until it is 7 characters long (longer values are kept as they
    are); the remaining fields are copied unchanged.  Each repaired row is
    printed, and finally the file is rewritten in place.

    :param fileName: path of the CSV file to repair; it is both the input
        and the output file.
    '''
    # NOTE(review): the file is overwritten in place and no backup copy is
    # written -- confirm this is intended before running on valuable data.
    with h.ManagedUtfFile(fileName) as source:
        reader = csv.reader(source)
        header = next(reader, None)
        repaired = [] if header is None else [header]
        for record in reader:
            if not record:
                continue
            # rjust() only pads when the value is shorter than 7 characters.
            fixed = [record[0].rjust(7, "0")] + record[1:]
            print(fixed)
            repaired.append(fixed)

    # The input is fully read and closed before it is reopened for writing.
    with h.ManagedUtfFile(fileName, mode='w') as target:
        csv.writer(target).writerows(repaired)
def test_openFileOk(self):
    '''Check that the test CSV file opens and yields at least one row.

    The whole file is read so that a parse error in any row surfaces here.
    '''
    with h.ManagedUtfFile(testFile) as f:
        reader = csv.reader(f)
        # The first row (the header) is enough to prove the file is readable.
        gotHeader = next(reader, None) is not None
        for _ in reader:
            pass
    assert gotHeader
def test_openCsvGetTsv(self):
    '''Look up every movie ID of the test CSV file via tsv.getMovieData.

    The first row is skipped as a header and empty rows are ignored.  For
    each remaining row the first field is used as the movie ID; the row,
    the ID and the lookup result (or 'No data found' when the lookup
    returns None) are printed.  The test makes no assertions; it only
    fails if reading or looking up raises.
    '''
    with h.ManagedUtfFile(testFile) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue
            movieId = row[0]
            print(row, movieId)
            imdbData = tsv.getMovieData(movieId)
            # Identity test: only a real None counts as "nothing found".
            if imdbData is not None:
                print(imdbData)
            else:
                print('No data found')
def readFileAddItemsToDb(fileName):
    '''Read a CSV file and store one database entry per non-empty data row.

    The first row is treated as a header and skipped.  Columns used:
    0 = movie ID, 4 = local title, 5 = medium, 6 = place.  A missing title
    column becomes ''; a missing or empty medium/place becomes '-'.

    Rows whose ID is '' or '0000000' are stored through
    dbc.addManMovieWithoutIdToDb, all other rows through
    dbc.insertMovieData.

    :param fileName: path of the CSV file to import.
    '''
    with h.ManagedUtfFile(fileName) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue
            columns = len(row)
            movie_id = row[0]
            title_local = row[4] if columns > 4 else ''
            # An absent column and an empty field both fall back to '-'.
            medium = (row[5] if columns > 5 else '') or '-'
            place = (row[6] if columns > 6 else '') or '-'
            if movie_id in ('', '0000000'):
                # rows without a usable ID
                dbc.addManMovieWithoutIdToDb(inputTitle=title_local, place=place, medium=medium)
            else:
                dbc.insertMovieData(inputMovieId=movie_id, inputTitle=title_local, place=place, medium=medium)
def testCsvInput(self):
    '''First version, demonstrating the CSV import end to end.

    Every non-empty data row of the test file (the first row is a header)
    must provide at least 8 columns.  Each such row is printed and added
    through dbc.addManMovieToDb; afterwards the number of Movie records
    must equal the number of rows added.
    '''
    count = 0
    with h.ManagedUtfFile(testFile) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue
            count += 1
            # Indexed access on purpose: a short row raises IndexError.
            imdbID, EAN, title, titleorig, titlelocal, medium, nr, source = [
                row[i] for i in range(8)
            ]
            print(imdbID, EAN, title, titleorig, titlelocal, medium, nr, source)
            dbc.addManMovieToDb(inputMovieId=imdbID, inputTitle=titlelocal, medium=medium)
    stored = Movie.query.all()
    assert len(stored) == count
def findLineWithId(filename, matchIdRaw, delimiter='\t', quiet=True):
    '''Binary search for a line by ID in a TSV file from IMDB.

    The ID of a line is its first field with the first 2 characters
    removed.  IDs are compared as strings, so the search only works when
    the file is ordered by that string.

    Because lines differ in length, the bisection is approximate: each
    probe seeks to the middle of the current byte range, discards the line
    it lands in and inspects the following one.  Once the iteration budget
    is nearly used up, the function reads forward line by line instead.

    :param filename: path of the TSV file.
    :param matchIdRaw: wanted ID as a string; it is left-padded with "0"
        to 7 characters before comparing.
    :param delimiter: field separator of the file.
    :param quiet: when false, progress information is printed.
    :return: the matching line as read from the file, or [] when the ID is
        not found or any error occurs (errors are logged, not raised).
    '''
    try:
        # Pad the wanted ID to the 7 characters it is compared against.
        match_id = matchIdRaw.rjust(7, "0")
        counter = 0
        helper_counter = 1
        start = 0
        line_id = ''
        if not quiet:
            print(os.getcwd(), filename)
        end = os.path.getsize(filename)
        # Number of binary digits of the file size: iteration budget.
        max_loops = len("{0:b}".format(end))
        length_bits = 0  # set from the first probed line below

        # NOTE(review): seeking to computed offsets assumes the object
        # returned by ManagedUtfFile accepts plain byte positions and that
        # tell() returns them -- confirm against the helper.
        with helper.ManagedUtfFile(filename) as fptr:
            while start < end and counter < max_loops + 4:
                last_id = line_id
                # Integer midpoint: seek() rejects the float that "/" yields.
                fptr.seek(start + (end - start) // 2)
                fptr.readline()  # discard the line we landed in the middle of
                line = fptr.readline()
                line_length = len(line)
                if counter == 0:
                    length_bits = len("{0:b}".format(line_length))
                line_id = line.split(sep=delimiter)[0][2:]  # ignore the first 2 chars
                if not quiet:
                    print('lineId {} length {} start {} end {}'.format(line_id, line_length, start, end))
                counter += 1

                if match_id == line_id:
                    if not quiet:
                        print("counter = {0} / {1}".format(counter, max_loops))
                    return line

                if match_id > line_id:
                    if max_loops - counter < length_bits + 6:
                        # Close to the target: scan forward linearly, since
                        # varying line lengths make bisection imprecise here.
                        while line_id < match_id:
                            line = fptr.readline()
                            if not line:
                                break  # end of file: stop instead of looping forever
                            line_id = line.split(sep=delimiter)[0][2:]
                            counter += 1
                        if match_id == line_id:
                            if not quiet:
                                print("counter = {0} / {1}".format(counter, max_loops))
                            return line
                        # The scan ran past the wanted ID (or hit EOF).  The
                        # range is unchanged, so further probes would only
                        # repeat the same scan: give up.
                        break
                    start = fptr.tell()
                    if start > end:
                        end = start + 1
                elif last_id == line_id:
                    # Same line as the previous probe: pull the upper bound
                    # back by a growing multiple of the line length.
                    helper_counter += 1
                    end = fptr.tell() - helper_counter * 4 * line_length
                    if start > end:
                        start = end - 1
                else:
                    end = fptr.tell()

        if not quiet:
            print("counter = {}, max = {}".format(counter, max_loops))
        return []
    except Exception:
        # Best-effort lookup: log the failure and report "not found".
        current_app.logger.error('Unhandled exception', exc_info=sys.exc_info())
        print("Oops!", sys.exc_info()[0], "occured.")
        return []