Пример #1
0
class TroveSwiftIndex(TroveIndex):
    """A Trove Index class that uses an in memory dictionary to store
    the index - for access to data stored in a Swift container"""

    def __init__(self):
        """Create the Swift text container, then run the base-class
        initialiser."""

        # Used by get_document() to fetch the raw text of a document.
        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndex, self).__init__()

    def get_document(self, id):
        """Get a document from the Swift container given the document id.

        id - the document identifier to look up in the index

        Return the decoded JSON value for the document (normally a
        Python dictionary with the document properties), or None if the
        id cannot be resolved in the index or the stored text is not
        valid JSON.
        """

        try:
            offset, length, datafile = self.get(id)
        except Exception:
            # Unknown id or an index entry that does not unpack into three
            # fields.  Unlike a bare ``except:`` this lets KeyboardInterrupt
            # and SystemExit propagate.
            return None

        # local indexer has stored full path to the file, truncate for swift
        datafile = os.path.basename(datafile)
        line = self.swifttext.get_by_offset(datafile, offset, length)

        try:
            return json.loads(line)
        except (TypeError, ValueError):
            # TypeError: nothing usable was returned for this offset;
            # ValueError: the text at this offset is not valid JSON.
            return None
Пример #2
0
def countwords(document):

    print "COUNT", document


    docname = document['name']

    outfile = "wordcount-" + docname

    sw = SwiftTextContainer()

    count = dict()
    wordcount = dict()

    n = 0
    for offset,line in sw.document_lines(docname):

        try:
            doc = json.loads(line.decode('utf-8'))
        except:
            sys.stdout.write('!*!')
            sys.stdout.flush()
            continue

        try:
            year = doc['date'][:4]
            wc = int(doc['wordCount'])
            if year in count:
                count[year] += 1
            else:
                count[year] = 0

            if year in wordcount:
                wordcount[year] += wc
            else:
                wordcount[year] = wc

            n += 1

            if n % INTERVAL == 0:
                write(count, wordcount, outfile)
                sys.stdout.write("%s|" % n)
                sys.stdout.flush()
        except:
            pass
Пример #3
0
    def __init__(self, datafile, out='index.idx'):
        """Create a Trove Index containing offsets of each document and write it to a file.

        datafile - the name of the source data file, can be gzipped or plain text
        out - forwarded to the base-class initialiser, default 'index.idx'
              (presumably the index file to write - confirm against the base class)
        """

        # These attributes are assigned before the base-class initialiser is
        # called; presumably the base class relies on them - TODO confirm.
        self.swifttext = SwiftTextContainer()
        self.datafile = datafile

        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)
Пример #4
0
class TroveSwiftIndexBuilder(TroveIndexBuilder):
    """Build an index for documents stored in a Swift object store"""


    def __init__(self, datafile, out='index.idx'):
        """Create a Trove Index containing offsets of each document write it to a file.

        datafile - the name of the source data file, can be gzipped or plain text
        outdir - output directory, default 'index'
        """

        self.swifttext = SwiftTextContainer()
        self.datafile = datafile

        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)


    def add_to_index(self, id, offset, length):
        """Add this id/offset pair to the index
        """

        # for swift we use the file baseam
        self.out.write("%s, %d, %d, %s\n" % (id, offset, length, os.path.basename(self.datafile)))


    def _build_index(self):
        """Build an index of the documents in the datafile
        """

        for offset, line in self.swifttext.document_lines(self.datafile):

            try:
                data = json.loads(line.decode('utf-8'))
            except:
                print "Bad line: ", offset, line
                continue

            if 'id' in data:
                id = data['id']
                self.add_to_index(id, offset, len(line))
            else:
                print "Bad line: ", line
Пример #5
0
    def __init__(self):
        """Create the SwiftTextContainer stored as self.swifttext, then run
        the base-class initialiser."""

        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndex, self).__init__()
Пример #6
0
    import optparse
    import sys

    parser = optparse.OptionParser()
    parser.add_option("-s", "--swift", dest="swift", action="store_true", default=False,
                      help="read data from a swift container")
    parser.add_option("-o", "--outdir", dest="outdir", action="store", default='index',
                      help="output directory for index files")

    (options, args) = parser.parse_args()


    if not os.path.exists(options.outdir):
        os.makedirs(options.outdir)

    if options.swift:
        container = SwiftTextContainer()

        for doc in container.documents():
            print doc
            base, ext = os.path.splitext(doc['name'])
            out = os.path.join(options.outdir, base + ".idx")
            TroveSwiftIndexBuilder(doc['name'], out=out)
    else:
        for doc in args:
            print doc
            base, ext = os.path.splitext(os.path.basename(doc))
            out = os.path.join(options.outdir, base + ".idx")
            TroveIndexBuilder(doc, out=out)
Пример #7
0
                count[year] += 1
            else:
                count[year] = 0

            if year in wordcount:
                wordcount[year] += wc
            else:
                wordcount[year] = wc

            n += 1

            if n % INTERVAL == 0:
                write(count, wordcount, outfile)
                sys.stdout.write("%s|" % n)
                sys.stdout.flush()
        except:
            pass


if __name__=='__main__':

    # Read the run-time settings from the 'default' config section.
    # INTERVAL is a module-level name that countwords() reads to decide how
    # often to write its tallies and print a progress marker.
    config = readconfig()
    INTERVAL = int(config.get('default', 'WC_INTERVAL'))
    processes = int(config.get('default', 'PROCESSES'))

    sw = SwiftTextContainer()

    # NOTE(review): Pool is presumably multiprocessing.Pool; INTERVAL is only
    # assigned inside this guard, so workers must inherit it from this
    # process - confirm for the platform's process start method.
    pool = Pool(processes)

    # Run countwords() once for every document listed by the container.
    pool.map(countwords, sw.documents())