Python SwiftTextContainer примеры использования

Язык программирования: Python

Пространство имен/Пакет: swifttext

Класс/Тип: SwiftTextContainer

Примеров на hotexamples.com: 7

Python SwiftTextContainer — найдено 7 примеров. Это лучшие примеры Python-кода для swifttext.SwiftTextContainer, полученные из проектов с открытым исходным кодом. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

document_lines(2)

documents(2)

get_by_offset(1)

Пример #1

Показать файл

Файл: index.py Проект: stevecassidy/trovenames

class TroveSwiftIndex(TroveIndex):
    """A Trove Index class that uses an in memory dictionary to store
    the index - for access to data stored in a Swift container.

    Index entries are looked up with ``self.get`` (not defined in this
    class, so expected to come from ``TroveIndex``); the document text is
    then read from Swift through a ``SwiftTextContainer``.
    """

    def __init__(self):
        # NOTE(review): the container is created before the base initializer
        # runs, presumably so the attribute exists if the base class needs
        # it during setup - confirm against TroveIndex.
        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndex, self).__init__()

    def get_document(self, id):
        """Get a document from the datafile given the document id.

        id - the document identifier to look up in the index

        Return a Python dictionary with the document properties (whatever
        ``json.loads`` produces for the stored line), or None if the id
        cannot be resolved or there is no valid JSON at the stored offset.
        Errors raised while reading from Swift are not caught here.
        """

        try:
            offset, length, datafile = self.get(id)
        except Exception:
            # The lookup's failure modes are not visible from here (missing
            # key, wrong tuple shape, ...), so any ordinary error is treated
            # as "no such document". Unlike a bare ``except:`` this still
            # lets KeyboardInterrupt and SystemExit propagate.
            return None

        # local indexer has stored full path to the file, truncate for swift
        datafile = os.path.basename(datafile)
        line = self.swifttext.get_by_offset(datafile, offset, length)

        try:
            return json.loads(line)
        except (TypeError, ValueError):
            # ValueError: the text is not valid JSON (or not decodable);
            # TypeError: nothing usable came back (e.g. None).
            return None

Пример #2

Показать файл

Файл: wordcount.py Проект: stevecassidy/trovenames

def countwords(document):
    """Tally documents and words per year for one document in Swift.

    document - a mapping with at least a 'name' key; that name is used to
               read the lines via ``SwiftTextContainer.document_lines`` and
               to build the output name ``"wordcount-" + name``

    Each line is decoded as UTF-8 JSON. The first four characters of its
    'date' field are used as the year and 'wordCount' is converted to int.
    Two dictionaries keyed by year are maintained: the number of records
    and the sum of their word counts.

    The running totals are passed to ``write`` every ``INTERVAL`` records
    and once more after the last record, so a trailing partial batch is
    not lost. Lines that cannot be decoded print a ``!*!`` marker; records
    with missing or unusable fields are skipped silently. Errors raised by
    ``write`` itself are no longer suppressed.
    """

    # Single-argument form: prints the same text as the Python 2 statement
    # ``print "COUNT", document`` and is also valid Python 3.
    print("COUNT %s" % (document,))

    docname = document['name']
    outfile = "wordcount-" + docname

    sw = SwiftTextContainer()

    count = dict()
    wordcount = dict()

    n = 0
    for _offset, line in sw.document_lines(docname):

        try:
            doc = json.loads(line.decode('utf-8'))
        except ValueError:
            # Covers both invalid UTF-8 (UnicodeDecodeError) and invalid JSON.
            sys.stdout.write('!*!')
            sys.stdout.flush()
            continue

        try:
            year = doc['date'][:4]
            wc = int(doc['wordCount'])
            # The first record for a year counts as 1, not 0. Done inside
            # the try so an unhashable year is skipped like other bad data.
            count[year] = count.get(year, 0) + 1
        except (KeyError, IndexError, TypeError, ValueError):
            # Record lacks the fields we need or they have the wrong shape.
            continue

        wordcount[year] = wordcount.get(year, 0) + wc
        n += 1

        if n % INTERVAL == 0:
            write(count, wordcount, outfile)
            sys.stdout.write("%s|" % n)
            sys.stdout.flush()

    # Flush whatever accumulated since the last periodic write.
    if n % INTERVAL != 0:
        write(count, wordcount, outfile)

Пример #3

Показать файл

Файл: index.py Проект: stevecassidy/trovenames

    def __init__(self, datafile, out='index.idx'):
        """Prepare Swift access, then delegate to the base index builder.

        datafile - the name of the source data file, can be gzipped or plain text
        out - forwarded unchanged to the base initializer, default 'index.idx'
        """

        # Both attributes are set before the base initializer is invoked.
        self.datafile = datafile
        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)

Пример #4

Показать файл

Файл: index.py Проект: stevecassidy/trovenames

class TroveSwiftIndexBuilder(TroveIndexBuilder):
    """Build an index for documents stored in a Swift object store"""

    def __init__(self, datafile, out='index.idx'):
        """Create a Trove index containing the offset of each document.

        datafile - the name of the source data file, can be gzipped or plain text
        out - forwarded to the base initializer, default 'index.idx'

        NOTE(review): presumably the base initializer opens ``self.out``
        and drives ``_build_index`` - confirm against TroveIndexBuilder.
        """

        self.swifttext = SwiftTextContainer()
        self.datafile = datafile

        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)

    def add_to_index(self, id, offset, length):
        """Add this id/offset/length entry to the index.

        Writes one line ``id, offset, length, filename`` to ``self.out``.
        """

        # for swift we use the file basename rather than the full path
        self.out.write(
            "%s, %d, %d, %s\n"
            % (id, offset, length, os.path.basename(self.datafile))
        )

    def _build_index(self):
        """Build an index of the documents in the datafile.

        Each line yielded by ``document_lines`` is decoded as UTF-8 JSON.
        Lines holding a JSON object with an 'id' key are indexed under that
        id with their offset and length; anything else is reported on
        stdout as a bad line and skipped.
        """

        for offset, line in self.swifttext.document_lines(self.datafile):

            try:
                data = json.loads(line.decode('utf-8'))
            except ValueError:
                # Invalid UTF-8 (UnicodeDecodeError) or invalid JSON.
                # Single-argument print keeps the Python 2 output text and
                # is also valid Python 3.
                print("Bad line:  %s %s" % (offset, line))
                continue

            # Only JSON objects can carry an id; a bare number or string
            # would otherwise raise on the membership test or the lookup.
            if isinstance(data, dict) and 'id' in data:
                doc_id = data['id']
                self.add_to_index(doc_id, offset, len(line))
            else:
                print("Bad line:  %s" % (line,))

Пример #5

Показать файл

Файл: index.py Проект: stevecassidy/trovenames

    def __init__(self):
        """Create the Swift text container, then run the base initializer."""

        # NOTE(review): assigned before super().__init__() - presumably so the
        # attribute already exists if the base initializer needs it; confirm.
        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndex, self).__init__()

Пример #6

Показать файл

Файл: index.py Проект: stevecassidy/trovenames

    import optparse
    import sys

    # Command-line interface: -s selects the Swift source, -o the directory
    # that receives one .idx file per document.
    parser = optparse.OptionParser()
    parser.add_option(
        "-s", "--swift",
        dest="swift",
        action="store_true",
        default=False,
        help="read data from a swift container",
    )
    parser.add_option(
        "-o", "--outdir",
        dest="outdir",
        action="store",
        default='index',
        help="output directory for index files",
    )

    options, args = parser.parse_args()

    # Make sure the output directory is there before any builder runs.
    if not os.path.exists(options.outdir):
        os.makedirs(options.outdir)

    if options.swift:
        # Index every document listed by the Swift container.
        container = SwiftTextContainer()

        for doc in container.documents():
            print(doc)
            name = doc['name']
            stem = os.path.splitext(name)[0]
            index_path = os.path.join(options.outdir, stem + ".idx")
            TroveSwiftIndexBuilder(name, out=index_path)
    else:
        # Index the local files named as positional arguments.
        for doc in args:
            print(doc)
            stem = os.path.splitext(os.path.basename(doc))[0]
            index_path = os.path.join(options.outdir, stem + ".idx")
            TroveIndexBuilder(doc, out=index_path)

Пример #7

Показать файл

Файл: wordcount.py Проект: stevecassidy/trovenames

                count[year] += 1
            else:
                count[year] = 0

            if year in wordcount:
                wordcount[year] += wc
            else:
                wordcount[year] = wc

            n += 1

            if n % INTERVAL == 0:
                write(count, wordcount, outfile)
                sys.stdout.write("%s|" % n)
                sys.stdout.flush()
        except:
            pass


if __name__=='__main__':

    # Run countwords over every document in the Swift container using a
    # pool of worker processes sized from the configuration.
    config = readconfig()
    # INTERVAL is a module-level name that countwords reads to decide how
    # often to write out its running totals.
    INTERVAL = int(config.get('default', 'WC_INTERVAL'))
    processes = int(config.get('default', 'PROCESSES'))

    sw = SwiftTextContainer()

    # NOTE(review): assumes Pool is multiprocessing.Pool (import not visible
    # here). try/finally is used instead of ``with`` because the file is
    # Python 2 code, where Pool is not a context manager.
    pool = Pool(processes)
    try:
        pool.map(countwords, sw.documents())
    finally:
        # Stop accepting work and wait for the workers to exit so none are
        # left behind, even if map() raised.
        pool.close()
        pool.join()