예제 #1
0
def repairIdInCsv(fileName):
    """Zero-pad the first column of a CSV file to seven characters, in place.

    The whole file is read first and then rewritten under the same name.
    The first row is treated as a header and copied unchanged. Empty rows
    are dropped. In every other row the first cell is left-padded with "0"
    to a width of 7 (cells that are already 7 or more characters long are
    kept as they are); the remaining cells are copied unchanged. Each
    rewritten data row is printed.

    Note: no backup copy is made; the input file is overwritten.

    Args:
        fileName: path of the CSV file, handed to ``h.ManagedUtfFile``.
    """
    repairedRows = []
    with h.ManagedUtfFile(fileName) as f:
        csvReader = csv.reader(f)

        # The header row (if the file has any rows at all) is kept verbatim.
        header = next(csvReader, None)
        if header is not None:
            repairedRows.append(header)

        for row in csvReader:
            if not row:
                continue  # blank lines are not carried over
            # rjust only pads when the ID is shorter than 7 characters.
            repairedRow = [row[0].rjust(7, "0")] + row[1:]
            print(repairedRow)
            repairedRows.append(repairedRow)

    # The read handle is closed by now, so the same path can be reopened
    # for writing.
    with h.ManagedUtfFile(fileName, mode='w') as fw:
        csv.writer(fw).writerows(repairedRows)
예제 #2
0
 def test_openFileOk(self):
     """The test file can be opened and yields at least one CSV row."""
     with h.ManagedUtfFile(testFile) as f:
         sawFirstRow = False
         # Iterate the complete file so that a read or parse problem
         # anywhere in it surfaces as a failure of this test.
         for _ in csv.reader(f):
             sawFirstRow = True
         assert sawFirstRow
예제 #3
0
 def test_openCsvGetTsv(self):
     """Look up the ID of every data row of the test CSV via ``tsv.getMovieData``.

     The first row is skipped as a header and empty rows are ignored. For
     each remaining row the first cell is used as the movie ID; the row,
     the ID and the lookup result (or 'No data found' when the lookup
     returns None) are printed. The test makes no assertions, it only has
     to run through without raising.
     """
     with h.ManagedUtfFile(testFile) as f:
         csvReader = csv.reader(f)
         next(csvReader, None)  # skip the header row
         for row in csvReader:
             if not row:
                 continue
             movieId = row[0]
             print(row, movieId)
             imdbData = tsv.getMovieData(movieId)
             # Identity check: None is a singleton, "!=" could be overridden.
             if imdbData is not None:
                 print(imdbData)
             else:
                 print('No data found')
예제 #4
0
def _cellOrDefault(row, index, default):
    """Return row[index], or default when that cell is missing or ''."""
    if len(row) > index and row[index] != '':
        return row[index]
    return default


def readFileAddItemsToDb(fileName):
    """Read a movie CSV file and add one database entry per data row.

    The first row is skipped as a header and empty rows are ignored.
    Cells used from each row: index 0 (IMDB ID), index 4 (local title,
    '' when absent), index 5 (medium) and index 6 (place); medium and
    place fall back to '-' when the cell is absent or empty.

    Rows with a real ID go through ``dbc.insertMovieData``; rows whose ID
    is '' or '0000000' go through ``dbc.addManMovieWithoutIdToDb``.

    Args:
        fileName: path of the CSV file, handed to ``h.ManagedUtfFile``.
    """
    with h.ManagedUtfFile(fileName) as f:
        rows = csv.reader(f)
        next(rows, None)  # header row
        for row in rows:
            if len(row) == 0:
                continue

            imdbID = row[0]
            titlelocal = _cellOrDefault(row, 4, '')
            medium = _cellOrDefault(row, 5, '-')
            place = _cellOrDefault(row, 6, '-')

            if imdbID == '' or imdbID == '0000000':
                # no usable IMDB ID: store the entry by title only
                dbc.addManMovieWithoutIdToDb(inputTitle=titlelocal,
                                             place=place,
                                             medium=medium)
            else:
                dbc.insertMovieData(inputMovieId=imdbID,
                                    inputTitle=titlelocal,
                                    place=place,
                                    medium=medium)
예제 #5
0
    def testCsvInput(self):
        """First version demonstrating the CSV import.

        Skips the header row, then for every non-empty row reads the first
        eight cells, prints them and adds the movie via
        ``dbc.addManMovieToDb``. Finally checks that the number of ``Movie``
        records equals the number of rows processed.
        """
        with h.ManagedUtfFile(testFile) as f:
            csvReader = csv.reader(f)
            next(csvReader, None)  # header row
            imported = 0
            for row in csvReader:
                if not row:
                    continue
                imported += 1
                # Every data row must provide at least eight cells.
                imdbID, EAN, title, titleorig = row[0], row[1], row[2], row[3]
                titlelocal, medium, nr, source = row[4], row[5], row[6], row[7]
                print(imdbID, EAN, title, titleorig, titlelocal, medium, nr, source)
                dbc.addManMovieToDb(inputMovieId=imdbID, inputTitle=titlelocal, medium=medium)

            storedMovies = Movie.query.all()
            assert len(storedMovies) == imported
예제 #6
0
def findLineWithId(filename, matchIdRaw, delimiter='\t', quiet=True):
    '''Binary search for an ID in a TSV file from IMDB.

    The wanted ID is left-padded with "0" to seven characters and compared,
    as a string, against the first column of a line with its first two
    characters removed (presumably the "tt"-style prefix of IMDB IDs;
    verify against the data). The search bisects the file by position and,
    once only a few probes are left, switches to a linear forward scan
    because lines have different lengths.

    Assumes the file is sorted ascending by that ID under string
    comparison; this is not checked.

    NOTE(review): the file is opened through ``helper.ManagedUtfFile`` and
    then positioned with plain integer offsets. If that helper returns a
    text-mode file, seeking into the middle of a multi-byte character may
    fail while decoding; such a failure ends up in the except branch below.

    Args:
        filename: path of the TSV file.
        matchIdRaw: ID to look for, as a string without the two-character
            prefix; shorter IDs are zero-padded to 7 characters.
        delimiter: column separator of the file.
        quiet: pass anything other than True to print progress information.

    Returns:
        The complete matching line as read from the file, or an empty list
        when the ID was not found or any error occurred (errors are logged
        through ``current_app.logger`` and printed, not raised).
    '''
    try:

        # check if ID is already 7 char long, pad with zeros otherwise
        matchId = matchIdRaw.rjust(7, "0")

        counter = 0
        helperCounter = 1
        start = 0
        lineId = ''
        if quiet != True:
            pwd = os.getcwd()
            print(pwd , filename)
        end = os.path.getsize(filename)
        # Number of binary digits of the file size: upper bound for the
        # number of halving steps a bisection over the file can need.
        endBinary = "{0:b}".format(end)
        maxLoopCount = len(endBinary)
        with helper.ManagedUtfFile(filename) as fptr:

            while (start < end) and (counter < (maxLoopCount + 4)):
                lastId = lineId
                # Floor division: seek() needs an integer offset, "/" would
                # produce a float here.
                pos = start + ((end - start) // 2)

                fptr.seek(pos)
                # pos usually lands inside a line: drop the partial line and
                # work with the next complete one.
                fptr.readline()
                line = fptr.readline()
                lineLength = len(line)
                if counter == 0:
                    # binary digit count of the first probed line's length,
                    # used below to decide when to switch to a linear scan
                    lineLengthInBits = "{0:b}".format(lineLength)
                    lineLengthInDigits = len(lineLengthInBits)
                values = line.split(sep=delimiter)
                firstValue = values[0]
                lineId = firstValue[2:]# ignore the first 2 chars
                if quiet != True:
                    print('lineId {} length {} start {} end {}'.format(lineId, lineLength, start, end))
                counter = counter + 1
                if matchId == lineId:
                    if quiet != True:
                        print("counter = {0} / {1}".format(counter, maxLoopCount))
                    return line
                elif matchId > lineId:
                    # newer dirty trick, from the near of the match, search linear, because of various line length
                    if maxLoopCount - counter < lineLengthInDigits + 6:
                        while lineId < matchId:
                            line = fptr.readline()
                            if not line:
                                # End of file reached without a match; without
                                # this check lineId would stay '' and the scan
                                # would never terminate.
                                if quiet != True:
                                    print("counter = {}, max = {}".format(counter, maxLoopCount))
                                return []
                            values = line.split(sep=delimiter)
                            firstValue = values[0]
                            lineId = firstValue[2:]  # ignore the first 2 chars
                            counter = counter + 1
                            if matchId == lineId:
                                if quiet != True:
                                    print("counter = {0} / {1}".format(counter, maxLoopCount))
                                return line
                    else:
                        # match lies behind the probed line: continue in the
                        # upper part of the range
                        start = fptr.tell()
                        if start > end:
                            end = start + 1
                else:
                    # dirty trick to fix various length of lines
                    if lastId == lineId:
                        # The same line was probed twice in a row: pull the
                        # upper bound back by a growing multiple of the line
                        # length to get out of the stall.
                        helperCounter = helperCounter + 1
                        end = fptr.tell() - helperCounter * 4 * lineLength
                        if start > end:
                            start = end - 1
                    else:
                        end = fptr.tell()
            if quiet != True:
                print("counter = {}, max = {}".format(counter, maxLoopCount))
            return []
    except Exception:
        # Best effort: log and report "not found" instead of raising.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        current_app.logger.error('Unhandled exception', exc_info=sys.exc_info())
        print("Oops!", sys.exc_info()[0], "occured.")
        return []