Example No. 1
def load_data():
    # Read the prediction CSV into a NumPy array of strings
    with open(predict_file, 'r', encoding='utf-8') as f:
        r = csv.reader(f, delimiter=',', quotechar='"')
        raw_data = np.array(list(r))
    s.print_data("raw", raw_data)

    # Drop the header row and the PO-number column, then remove feature column 6
    str_data = np.delete(raw_data[1:, 1:], [6], axis=1)
    # The first column (skipping the header) holds the PO numbers
    po_nums = raw_data[1:, 0]

    return s.normalize(str_data.astype('float32')), po_nums
Example No. 2
def parse(tokens):
    """
    parse tokenized query to simplified executable list structure based on Shunting-yard algorithm
    :param tokens: tokens of a query, including words and strings of 'AND', 'OR', 'NOT', '(', and ')'
    :return: parsed tokens
    """
    # Parse
    special_tokens = ['AND', 'OR', 'NOT', '(', ')']
    ops = {
        'NOT': 4,
        'AND': 3,
        'OR': 2,
    }  # precedence; listing NOT here avoids a KeyError when it sits on the operator stack
    output = []
    op_stack = []
    # Adapted from pseudo code provided in the Shunting-yard algorithm wikipedia page
    for i in range(len(tokens)):
        if not tokens[i] in special_tokens:
            word = normalize(tokens[i])
            output.append(word)
        elif tokens[i] == 'NOT':
            op_stack.append('NOT')
        elif tokens[i] in ops:
            while (op_stack and op_stack[-1] != '('
                   and ops[op_stack[-1]] >= ops[tokens[i]]):
                output.append(op_stack.pop())
            op_stack.append(tokens[i])
        elif tokens[i] == '(':
            op_stack.append('(')
        elif tokens[i] == ')':
            while op_stack and op_stack[-1] != '(':
                output.append(op_stack.pop())
            if op_stack and op_stack[-1] == '(':
                op_stack.pop()
            if op_stack and op_stack[-1] == 'NOT':
                output.append(op_stack.pop())
    while op_stack:
        output.append(op_stack.pop())
    # Simplify
    i = 0
    while i < len(output):
        # Double negation is canceled
        if output[i] == 'NOT' and output[i - 1] == 'NOT':
            output[(i - 1):] = output[(i + 1):]
        # Use De Morgan's Law to take OR in NOT out
        elif output[i] == 'NOT' and output[i - 1] == 'OR':
            first_operand = get_operand(output, i - 1)
            second_operand = get_operand(output, i - 1 - len(first_operand))
            remaining = i + 1
            i = i - 2 - len(first_operand) - len(second_operand)
            output[(i + 1):] = (second_operand + ['NOT'] +
                                first_operand + ['NOT', 'AND'] +
                                output[remaining:])
        i = i + 1
    return output
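A minimal usage sketch for parse(); the normalize stub below is a hypothetical stand-in for the project's real token normalizer, which is not shown in this example:

def normalize(token):
    # hypothetical stand-in for the normalizer that parse() calls
    return token.lower()

# "NOT apple AND banana", tokenized by the caller
print(parse(['NOT', 'apple', 'AND', 'banana']))
# expected postfix: ['apple', 'NOT', 'banana', 'AND'], i.e. (NOT apple) AND banana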
Example No. 3
    def resize_save_preview(self, outpath, target_shape=(4000, 4000)):
        h_, w_, *_ = self.shape
        rrate_ = min(target_shape[0]/h_, target_shape[1]/w_)
        target_shape = (np.array([h_, w_], dtype=Config.TYPE_FLOAT)*rrate_).astype(dtype=int)

        # Copy the full image into a temporary disk-backed array before resizing
        tmppath = path.join(Config.TEMP_DIR, str(uuid.uuid4()))
        diskcopy = DiskMap(tmppath, dtype=self.dtype, mode='w+', shape=self.shape)
        diskcopy[:] = self[:]
        resized = imresize(diskcopy, target_shape)
        normalized = (normalize(resized)*255.).astype(np.uint8)
        imsave(outpath, exposure.rescale_intensity(normalized))
        del normalized
        print('Resized preview saved', outpath, target_shape)

        # Remove the temporary disk-backed copy, not the resized in-memory array
        DiskMap.remove_file(diskcopy)
Example No. 4
def bigram_inverted_index(collection):
    # bigram_index maps each vocabulary word to its character bigrams;
    # bigram_iindex maps each bigram back to the words containing it
    bigram_index = defaultdict(set)
    bigram_iindex = defaultdict(set)
    for doc in collection:
        # preprocess without lemmatization
        new_text = normalize(doc.full_text())
        tokens = tokenize(new_text)
        text_tokens = remove_stop_word(tokens)
        for word in set(text_tokens):
            if word in bigram_index:
                continue
            bigrams = get_bigrams(word)
            bigram_index[word] = set(bigrams)
            for bigram in bigrams:
                bigram_iindex[bigram].add(word)
    return bigram_index, bigram_iindex
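A small sketch of how the index above might be built and queried; the preprocessing helpers (normalize, tokenize, remove_stop_word, get_bigrams) and the document class below are hypothetical stand-ins for the project's real ones:

import re
from collections import defaultdict

def normalize(text):
    return text.lower()

def tokenize(text):
    return re.findall(r"[a-z]+", text)

def remove_stop_word(tokens):
    stop = {'the', 'a', 'an', 'and', 'or'}
    return [t for t in tokens if t not in stop]

def get_bigrams(word):
    padded = '$' + word + '$'  # pad so leading/trailing characters also form bigrams
    return [padded[i:i + 2] for i in range(len(padded) - 1)]

class Doc:
    def __init__(self, text):
        self._text = text
    def full_text(self):
        return self._text

docs = [Doc("the quick brown fox"), Doc("a lazy brown dog")]
bigram_index, bigram_iindex = bigram_inverted_index(docs)
print(bigram_index['brown'])  # character bigrams of 'brown'
print(bigram_iindex['br'])    # every indexed word containing the bigram 'br'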
Example No. 5
def load_process_data_b():
    f1 = open(pos_file, 'r', encoding='utf-8')
    f2 = open(neg_file, 'r', encoding='utf-8')
    r1 = csv.reader(f1, delimiter=',', quotechar='"')
    r2 = csv.reader(f2, delimiter=',', quotechar='"')

    pos_data = process_b(np.array(list(r1)))
    neg_data = process_b(np.array(list(r2)))
    data = np.concatenate((pos_data, neg_data), axis=0)
    np.random.shuffle(data)

    # Column 5 is 'violation', which is the label (y_data) during training, so drop it from the features
    x_data = s.normalize(np.delete(data, [5], axis=1))
    y_data = data[:, 5]
    # Trim to a multiple of const_folds (e.g. 20156 -> 20150).
    # An earlier version used "// 1280 * 1280"; hongwei changed it because with a small
    # dataset that floor division goes to 0, which made debugging impossible.
    len_data = data.shape[0] // const_folds * const_folds

    f1.close(); f2.close()
    return x_data[:len_data], y_data[:len_data]
Example No. 6
def load_process_data_m():
    f1 = open(neg_file, 'r', encoding='utf-8')
    r1 = csv.reader(f1, delimiter=',', quotechar='"')
    data_list = np.array(list(r1))
    data = process_m(data_list)

    # Trim to a multiple of const_folds (by hongwei: 20156 -> 20150)
    len_data = data.shape[0] // const_folds * const_folds

    # Column 5 is the label, so drop it from the features and keep it separately as y_data
    x_data = s.normalize(np.delete(data, [5], axis=1))
    y_data = data[:, 5]  # take only column 5 of the 2D array
    f1.close()
    return x_data[:len_data], y_data[:len_data]
Example No. 7
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print("indexing...")
    # Initializations
    dictionary = {}
    doc_freq = {}

    # Iterate through each file.
    # Not simply using sorted(files) because that would sort the names
    # alphabetically instead of by document index number.
    files = os.listdir(in_dir)
    file_index = 0
    block_count = 0
    file_count = 0
    while file_index < max_doc_id:
        # For each file, tokenize and normalize the words
        file_index = file_index + 1
        if str(file_index) not in files:
            continue
        file_name = os.path.join(in_dir, str(file_index))

        # Get the unprocessed words
        with open(file_name, 'r') as reader:
            content = reader.read()

        file_count += 1
        words_in_doc = []
        words = nltk.word_tokenize(content)
        for w in words:
            ws = w.split('/')
            for word in ws:
                word = normalize(word)
                if not word == "" and word not in words_in_doc:
                    words_in_doc.append(word)

        for word in words_in_doc:
            if word not in dictionary:
                dictionary[word] = []
                doc_freq[word] = 0
            dictionary[word].append(file_index)
            doc_freq[word] += 1

        # Divide the files into blocks, each block with block_size files
        # index every block by applying BSBI
        if file_count >= block_size:
            block_count += 1
            temp_dict_path = "./temp_dict" + str(block_count) + ".txt"
            temp_posting_path = "./temp_post" + str(block_count) + ".txt"
            bsbi_invert(dictionary, doc_freq, temp_dict_path,
                        temp_posting_path)
            dictionary = {}
            doc_freq = {}
            file_count = 0

    if len(dictionary) >= 1:  # Construct last block using BSBI
        block_count += 1
        temp_dict_path = "./temp_dict" + str(block_count) + ".txt"
        temp_posting_path = "./temp_post" + str(block_count) + ".txt"
        bsbi_invert(dictionary, doc_freq, temp_dict_path, temp_posting_path)

    merge_block(block_count, out_dict, out_postings)
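build_index() buffers block_size documents at a time, writes each block's partial index out through bsbi_invert, and merges everything at the end, in the spirit of BSBI. A self-contained toy sketch of that block-then-merge flow, with in-memory dicts standing in for the temporary files (all names below are illustrative, not the project's helpers):

from collections import defaultdict

def toy_blocked_index(docs, block_size=2):
    # docs: list of (doc_id, [terms]); returns term -> sorted list of doc_ids
    blocks = []
    block = defaultdict(list)
    for count, (doc_id, terms) in enumerate(docs, start=1):
        for term in set(terms):
            block[term].append(doc_id)
        if count % block_size == 0:  # block is full: "write it out"
            blocks.append(block)
            block = defaultdict(list)
    if block:  # flush the last, possibly partial, block
        blocks.append(block)

    merged = defaultdict(list)  # merge the per-block postings lists
    for b in blocks:
        for term, postings in b.items():
            merged[term].extend(postings)
    return {term: sorted(p) for term, p in merged.items()}

docs = [(1, ['apple', 'pear']), (2, ['apple']), (3, ['pear', 'plum'])]
print(toy_blocked_index(docs))  # postings: apple [1, 2], pear [1, 3], plum [3]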
Example No. 8
    def slice_normalize(self, roi):
        # Reorder the ROI so (roi[0], roi[1]) is the top-left and (roi[2], roi[3]) the bottom-right corner
        roi = [min(roi[0], roi[2]), min(roi[1], roi[3]), max(roi[0], roi[2]), max(roi[1], roi[3])]
        w_ = roi[2] - roi[0]
        h_ = roi[3] - roi[1]
        sliced = np.array(self[roi[1]:roi[1]+h_, roi[0]:roi[0]+w_]).astype(dtype=Config.TYPE_FLOAT)
        return (normalize(sliced)*255.).astype(np.uint8)
Example No. 9
    def copy_normalize(self, signed=False):
        # Copy the data into a temporary disk-backed array and normalize it in place
        tmppath = path.join(Config.TEMP_DIR, str(uuid.uuid4()))
        copy = DiskMap(tmppath, dtype=Config.TYPE_FLOAT, mode='w+', shape=self.shape)
        copy[:] = self[:]
        normalize(copy, signed)
        return copy