Пример #1
0
def read_tsv(argv,file_encoding="latin-1"):
    """Build a patricia trie from a tab-separated file of JSON key/value pairs.

    Every input line must hold exactly two tab-separated fields: a
    JSON-encoded key and a JSON-encoded sequence of strings.  The sequence is
    joined with "," to form the value.  A value is recorded for a key only
    while ``p2.isWord(key)`` is still false, i.e. the first time the key is
    added to the trie.

    Args:
        argv: Command-line style argument list.  ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Encoding used to decode the input file.

    Returns:
        A ``(p2._data, allwords)`` tuple, where ``allwords`` lists every key
        read from the file in order, duplicates included.

    Raises:
        ValueError: If a line does not split into exactly two fields or a
            field is not valid JSON.
    """
    import codecs

    filename = "../testdata/norwegian_words.txt"
    if len(argv) > 1:
        filename = argv[1]

    p2 = patricia()
    allwords = []
    key_value = {}

    # The context manager closes the file even when a line fails to parse;
    # previously the handle stayed open until garbage collection.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word, value = line.strip().split("\t")
            # Debug trace of the raw (still JSON-encoded) fields.  Written
            # with sys.stderr.write so it runs on both Python 2 and 3.
            sys.stderr.write("%s\n" % ([word, value],))
            word = json.loads(word.strip())
            value = json.loads(value.strip())
            nvalue = ",".join(value)
            allwords.append(word)
            if not p2.isWord(word):
                p2.addWord(word)
                key_value[word] = nvalue

    # allwords may contain duplicates; attach the value to each non-empty
    # key exactly once.
    added = set()
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, key_value[word])
            added.add(word)

    return p2._data, allwords
Пример #2
0
def read_input(argv,file_encoding="latin-1"):
    """Build a patricia trie from a file holding one JSON-encoded key per line.

    Keys containing "/" are reported on stderr and skipped.  Every other key
    that ``p2.isWord`` does not yet know is appended to the result list and
    added to the trie.  Afterwards each non-empty key is stored with itself
    as its value.

    Args:
        argv: Command-line style argument list.  ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Encoding used to decode the input file.

    Returns:
        A ``(p2._data, allwords)`` tuple, where ``allwords`` lists the keys
        that were added to the trie, in file order.

    Raises:
        ValueError: If a line is not valid JSON.
    """
    import codecs

    filename = "../testdata/norwegian_words.txt"
    if len(argv) > 1:
        filename = argv[1]

    p2 = patricia()
    allwords = []

    # The context manager closes the file even when a line fails to parse;
    # previously the handle stayed open until garbage collection.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word = json.loads(line.strip())

            if '/' in word:
                # sys.stderr.write keeps this working on Python 2 and 3.
                sys.stderr.write("Skipping word: %s\n" % ([word],))
                continue

            if not p2.isWord(word):
                allwords.append(word)
                p2.addWord(word)

    # Attach the value to each non-empty key exactly once.
    added = set()
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, word)
            added.add(word)

    return p2._data, allwords
Пример #3
0
def read_tsv(argv, file_encoding="latin-1"):
    """Load tab-separated JSON key/value lines into a patricia trie.

    Each line is split on the tab character into exactly two fields.  The
    first is JSON-decoded into the key, the second into a sequence of strings
    that is joined with "," to become the value.  The value is remembered
    only when ``p2.isWord(key)`` is false at that point, i.e. when the key is
    newly added to the trie.

    Args:
        argv: Command-line style argument list.  ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Encoding used to decode the input file.

    Returns:
        A ``(p2._data, allwords)`` tuple, where ``allwords`` holds every key
        in file order, duplicates included.

    Raises:
        ValueError: If a line does not split into exactly two fields or a
            field is not valid JSON.
    """
    import codecs

    filename = "../testdata/norwegian_words.txt"
    if len(argv) > 1:
        filename = argv[1]

    p2 = patricia()
    allwords = []
    key_value = {}

    # "with" guarantees the file is closed, also on a parse error; the
    # handle used to be left open until garbage collection.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word, value = line.strip().split("\t")
            word = json.loads(word.strip())
            value = json.loads(value.strip())
            nvalue = ",".join(value)
            allwords.append(word)
            if not p2.isWord(word):
                p2.addWord(word)
                key_value[word] = nvalue

    # allwords may repeat keys; set the value once per non-empty key.
    added = set()
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, key_value[word])
            added.add(word)

    return p2._data, allwords
Пример #4
0
def read_input(argv, file_encoding="latin-1"):
    """Load one JSON-encoded key per line into a patricia trie.

    A key containing "/" is reported on stderr and ignored.  Any other key
    for which ``p2.isWord`` is false is appended to the result list and added
    to the trie.  Once the file is read, every non-empty key is given itself
    as its value.

    Args:
        argv: Command-line style argument list.  ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Encoding used to decode the input file.

    Returns:
        A ``(p2._data, allwords)`` tuple, where ``allwords`` holds the keys
        added to the trie, in file order.

    Raises:
        ValueError: If a line is not valid JSON.
    """
    import codecs

    filename = "../testdata/norwegian_words.txt"
    if len(argv) > 1:
        filename = argv[1]

    p2 = patricia()
    allwords = []

    # "with" guarantees the file is closed, also on a parse error; the
    # handle used to be left open until garbage collection.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word = json.loads(line.strip())

            if '/' in word:
                # Plain write() instead of the Python-2-only "print >>".
                sys.stderr.write("Skipping word: %s\n" % ([word],))
                continue

            if not p2.isWord(word):
                allwords.append(word)
                p2.addWord(word)

    # Set the value once per non-empty key.
    added = set()
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, word)
            added.add(word)

    return p2._data, allwords
Пример #5
0
    def reducer_init(self):
        """Initialise the reducer's state.

        Sets ``all_keys`` to an empty list, ``patricia_tree`` to a new
        ``patricia`` instance and ``keyvalue`` to an empty dict.  An exception
        raised during this setup is not propagated: it is reported through
        ``self.increment_counter("reducer_init", str(e), 1)`` instead, and
        the attributes after the failing statement stay unset.
        """
        try:
            self.all_keys = []
            self.patricia_tree = patricia()
            # NOTE(review): the original comment names atbr.Atbr() here,
            # presumably as an alternative store -- confirm before swapping.
            self.keyvalue = {}
        except Exception as e:
            # "except ... as" is valid on Python 2.6+ and on Python 3; the
            # older "except Exception, e" form is a SyntaxError on Python 3.
            self.increment_counter("reducer_init", str(e), 1)
Пример #6
0
    def reducer_init(self):
        """Create the containers this reducer works with.

        Assigns an empty list to ``all_keys``, a new ``patricia`` instance to
        ``patricia_tree`` and an empty dict to ``keyvalue``.  If any of these
        steps raises, the exception is swallowed and reported by calling
        ``self.increment_counter("reducer_init", str(e), 1)``; attributes
        after the failing step are then left unset.
        """
        try:
            self.all_keys = []
            self.patricia_tree = patricia()
            # NOTE(review): the original comment names atbr.Atbr() here,
            # presumably as an alternative store -- confirm before swapping.
            self.keyvalue = {}
        except Exception as e:
            # The "as" spelling parses on Python 2.6+ and Python 3, unlike
            # the comma form.  Handler body re-indented to one level.
            self.increment_counter("reducer_init", str(e), 1)
Пример #7
0
def read_input(argv,file_encoding="latin-1"):
    """Build a patricia trie from a key file and pass through a second path.

    ``argv`` must have exactly three entries: program name, key file and
    key/value file.  Otherwise a usage message is printed to stdout and the
    process exits with status 1.

    The key file holds one JSON-encoded key per line.  Each key for which
    ``p2.isWord`` is false is appended to the result list and added to the
    trie.  Afterwards every non-empty key gets its own reversal
    (``word[::-1]``) as value.  Progress is reported on stderr every 10000
    keys in both passes.

    Args:
        argv: Command-line style argument list of length three.
        file_encoding: Encoding used to decode the key file.

    Returns:
        A ``(p2._data, allwords, key_value_filename)`` tuple, where
        ``key_value_filename`` is ``argv[2]`` returned unchanged.

    Raises:
        SystemExit: With code 1 when ``argv`` does not have three entries.
        ValueError: If a line of the key file is not valid JSON.
    """
    if len(argv) != 3:
        # The original printed a literal "%s" because the format string was
        # never interpolated; fill in the program name.
        program = argv[0] if argv else "<program>"
        sys.stdout.write("usage: %s <key_file> <key_value_file>\n" % program)
        sys.exit(1)
    filename = argv[1]
    key_value_filename = argv[2]

    import codecs
    p2 = patricia()
    allwords = []

    i = 0
    # The context manager closes the file even when a line fails to parse.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word = json.loads(line.strip())
            if not p2.isWord(word):
                allwords.append(word)
                p2.addWord(word)
                if i % 10000 == 0:
                    # Double space kept so the output matches what the
                    # Python 2 print statement produced.
                    sys.stderr.write("added key number  %s\n" % i)
                i += 1

    added = set()

    j = 0
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, word[::-1])
            added.add(word)
            if j % 10000 == 0:
                sys.stderr.write("added key number  %s\n" % j)
            j += 1

    return p2._data, allwords, key_value_filename
Пример #8
0
def read_input(argv, file_encoding="latin-1"):
    """Load keys from a file into a patricia trie; return a second path too.

    Exactly three ``argv`` entries are required (program name, key file,
    key/value file).  With any other count a usage line is written to stdout
    and the process exits with status 1.

    Each line of the key file is JSON-decoded into a key.  A key for which
    ``p2.isWord`` is false is appended to the result list and added to the
    trie.  In a second pass every non-empty key is given its reversed text
    (``word[::-1]``) as value.  Both passes report progress on stderr every
    10000 keys.

    Args:
        argv: Command-line style argument list of length three.
        file_encoding: Encoding used to decode the key file.

    Returns:
        A ``(p2._data, allwords, key_value_filename)`` tuple, where
        ``key_value_filename`` is ``argv[2]`` returned unchanged.

    Raises:
        SystemExit: With code 1 when ``argv`` does not have three entries.
        ValueError: If a line of the key file is not valid JSON.
    """
    if len(argv) != 3:
        # The usage text used to show a literal "%s" because nothing was
        # substituted into it; insert the program name.
        program = argv[0] if argv else "<program>"
        sys.stdout.write("usage: %s <key_file> <key_value_file>\n" % program)
        sys.exit(1)
    filename = argv[1]
    key_value_filename = argv[2]

    import codecs
    p2 = patricia()
    allwords = []

    i = 0
    # "with" guarantees the file is closed, also on a parse error.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word = json.loads(line.strip())
            if not p2.isWord(word):
                allwords.append(word)
                p2.addWord(word)
                if i % 10000 == 0:
                    # Two spaces reproduce the Python 2 print output.
                    sys.stderr.write("added key number  %s\n" % i)
                i += 1

    added = set()

    j = 0
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, word[::-1])
            added.add(word)
            if j % 10000 == 0:
                sys.stderr.write("added key number  %s\n" % j)
            j += 1

    return p2._data, allwords, key_value_filename
Пример #9
0
def big_test(argv,file_encoding="latin-1"):
    """Fill a patricia trie with the words of a one-word-per-line file.

    Each line is stripped of surrounding whitespace and used as a word.
    Words for which ``p2.isWord`` is false are added to the trie.  Afterwards
    every distinct non-empty word is stored with its reversal (``word[::-1]``)
    as value.

    Args:
        argv: Command-line style argument list.  ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Encoding used to decode the input file.

    Returns:
        A ``(p2._data, allwords)`` tuple, where ``allwords`` lists every
        stripped line in file order, duplicates and empty strings included.
    """
    import codecs

    filename = "../testdata/norwegian_words.txt"
    if len(argv) > 1:
        filename = argv[1]

    p2 = patricia()
    allwords = []

    # The context manager closes the file deterministically; previously the
    # handle stayed open until garbage collection.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word = line.strip()
            allwords.append(word)
            if not p2.isWord(word):
                p2.addWord(word)

    # allwords may contain duplicates; set the value once per non-empty word.
    added = set()
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, word[::-1])
            added.add(word)

    return p2._data, allwords
Пример #10
0
def big_test(argv, file_encoding="latin-1"):
    """Load a word list (one word per line) into a patricia trie.

    Every line, stripped of surrounding whitespace, is treated as a word.
    A word is added to the trie when ``p2.isWord`` is false for it.  Once the
    file is read, each distinct non-empty word receives its reversed text
    (``word[::-1]``) as value.

    Args:
        argv: Command-line style argument list.  ``argv[1]``, when present,
            replaces the default input path.
        file_encoding: Encoding used to decode the input file.

    Returns:
        A ``(p2._data, allwords)`` tuple, where ``allwords`` holds every
        stripped line in file order, duplicates and empty strings included.
    """
    import codecs

    filename = "../testdata/norwegian_words.txt"
    if len(argv) > 1:
        filename = argv[1]

    p2 = patricia()
    allwords = []

    # "with" guarantees the file is closed; the handle used to be left open
    # until garbage collection.
    with codecs.open(filename, encoding=file_encoding) as infile:
        for line in infile:
            word = line.strip()
            allwords.append(word)
            if not p2.isWord(word):
                p2.addWord(word)

    # allwords may repeat words; set the value once per non-empty word.
    added = set()
    for word in allwords:
        if word not in added and len(word) > 0:
            p2.addVal(word, word[::-1])
            added.add(word)

    return p2._data, allwords