Example #1
def phrase_search(query, inverted_index):
    phrase_search_result_list = []
    query_info = re.findall(r'(\w+)\s"(\w+)\s(\w+)"', query, re.DOTALL)
    # query_info[0] is (serial_number, first_word_of_phrase, second_word_of_phrase)
    # pre_processing returns a single-element list here, so pre_processing('str')[0] == 'str'
    serial_number = query_info[0][0]
    first_term = pre_processing(query_info[0][1])[0]
    second_term = pre_processing(query_info[0][2])[0]
    if first_term in inverted_index.keys(
    ) and second_term in inverted_index.keys():
        for doc_info_of_first_term in inverted_index[
                first_term]:  #doc_info[0] docID; doc_info[1] list of positions
            for doc_info_of_second_term in inverted_index[second_term]:
                if doc_info_of_first_term[0] == doc_info_of_second_term[0]:
                    for position_of_second_term in doc_info_of_second_term[1]:
                        for position_of_first_term in doc_info_of_first_term[
                                1]:
                            if (position_of_second_term -
                                    position_of_first_term) == 1:
                                phrase_search_result_list.append(
                                    doc_info_of_first_term[0])
                                break
                        else:
                            continue
                        break
                    break
    list_of_docid = [int(docid) for docid in phrase_search_result_list]
    with open("results.boolean.txt", "a") as written_file:
        for docid in sorted(list_of_docid):
            print("%s,%s" % (serial_number, docid), file=written_file)
Example #2
def load_detection_images():
    total_files = glob(DETECTION_PATH + "*.jpg")

    img1 = pre_processing(total_files[0])
    img2 = pre_processing(total_files[1])

    return img1, img2
Example #3
def proximity_search(query, inverted_index):
    query_info = re.findall(r"(\w+)\s#(\d+)\((\w+),\s?(\w+)\)", query,
                            re.DOTALL)
    list_of_docid = []
    # query_info[0] is (serial_number, proximity_num, first_term, second_term)
    serial_number = query_info[0][0]
    proximity_num = int(query_info[0][1])  # str -> int
    first_term = pre_processing(query_info[0][2])[0]
    second_term = pre_processing(query_info[0][3])[0]
    if first_term in inverted_index.keys(
    ) and second_term in inverted_index.keys():
        for doc_info_of_first_term in inverted_index[
                first_term]:  #doc_info[0] docID; doc_info[1] list of positions
            for doc_info_of_second_term in inverted_index[second_term]:
                if doc_info_of_first_term[0] == doc_info_of_second_term[0]:
                    for position_of_second_term in doc_info_of_second_term[1]:
                        for position_of_first_term in doc_info_of_first_term[
                                1]:
                            if abs(position_of_second_term -
                                   position_of_first_term) <= proximity_num:
                                list_of_docid.append(doc_info_of_first_term[0])
                                break
                        else:
                            continue
                        break
                    break

    list_of_docid = [int(docid) for docid in list_of_docid]
    if len(list_of_docid) != 0:
        with open("results.boolean.txt", "a") as written_file:
            for docid in sorted(list_of_docid):
                print("%s,%s" % (serial_number, docid), file=written_file)
Example #4
def term_boolean_term(query, inverted_index):

    query_info = re.findall(r"(\w+)\s(\w+)\s(.+)\s(\w+)", query, re.DOTALL)
    serial_number = query_info[0][0]
    first_term = pre_processing(query_info[0][1])[0]
    operator = query_info[0][2]
    second_term = pre_processing(query_info[0][3])[0]
    if first_term in inverted_index.keys(
    ) and second_term in inverted_index.keys():
        list_of_first_term = []
        list_of_second_term = []
        for doc_info_of_first_term in inverted_index[
                first_term]:  #doc_info_of_X_term [0] docID; doc_info_of_X_term[1] list of positions
            list_of_first_term.append(doc_info_of_first_term[0])
        for doc_info_of_second_term in inverted_index[second_term]:
            list_of_second_term.append(doc_info_of_second_term[0])

        if operator == "AND":
            list_of_docid = [
                docid for docid in list_of_first_term
                if docid in list_of_second_term
            ]

        if operator == "OR":
            list_of_docid = list(set(list_of_first_term + list_of_second_term))

        if operator == "AND NOT":
            list_of_docid = [
                docid for docid in list_of_first_term
                if docid not in list_of_second_term
            ]

        if operator == 'OR NOT':
            all_docid = []
            xml_content = extract_xml_file(
                "trec.5000.xml"
            )  # single_file[0]: docid; single_file[1]: headline; single_file[2]: text
            for single_file in xml_content:
                all_docid.append(single_file[0])
            not_second_term_docid = [
                x for x in all_docid if x not in list_of_second_term
            ]
            list_of_docid = list(
                set(list_of_first_term + not_second_term_docid))

        list_of_docid = [int(docid) for docid in list_of_docid]
        if len(list_of_docid) != 0:
            with open("results.boolean.txt", "a") as written_file:
                for docid in sorted(list_of_docid):
                    print("%s,%s" % (serial_number, docid), file=written_file)
Example #5
def positional_inverted_index():
    file_tokens_all = []
    file_tokens = []
    file_info = []
    xml_content = extract_xml_file("trec.5000.xml")
    for single_file in xml_content:
        # single_file[0]: docid; single_file[1]: headline; single_file[2]: text
        # file_info format: [(docid, [tokens of headline + text]), ...]
        file_tokens = pre_processing(single_file[1]) + pre_processing(
            single_file[2])
        file_tokens_all += file_tokens
        file_info.append((single_file[0], file_tokens))

    inverted_index = dict()

    set_tokens = set(file_tokens_all)

    for term in set_tokens:
        list_of_term_inverted_index = []
        for single_file in file_info:
            # single_file[0]: docid; single_file[1]: list of tokens for that document
            if term in single_file[1]:
                list_of_positions = []
                for i in range(1, len(single_file[1]) + 1):  # positions are 1-based
                    if term == single_file[1][i - 1]:
                        list_of_positions.append(i)
                list_of_term_inverted_index.append(
                    (single_file[0], list_of_positions))

        # postings for this term: a list of (docid, [positions]) tuples
        inverted_index[term] = list_of_term_inverted_index

    #Reference: doc_info[0] = docID  ;   doc_info[1] = list of positions
    with open("index.txt", "w") as written_file:
        for key_of_term in sorted(inverted_index):
            print("%s:%i" % (key_of_term, len(inverted_index[key_of_term])),
                  file=written_file)
            for doc_info in inverted_index[key_of_term]:
                print("\t%s:" % doc_info[0],
                      ",".join(str(i) for i in doc_info[1]),
                      file=written_file)  #why is %s
            print(file=written_file)
    return inverted_index
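
For reference (not part of the original snippet), the write-out loop produces one block per term: `term:document_frequency`, then one tab-indented `docid: positions` line per document, then a blank line. For a hypothetical term that occurs only in document 102 at positions 3 and 7, the block in index.txt would look like:

income:1
	102: 3,7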
Example #6
def browsefunc():
    global lines, i  # 'i' must be declared global before it is read below
    lines = []
    filename = filedialog.askopenfilename()
    x.append(filename)
    print(x[i])
    lines = preprocessing.pre_processing(x[i])
    print("preparing image .... Done")
    i += 1
Example #7
def single_term_search(query, inverted_index):
    query_info = re.findall(r"(\w+)\s(\w+)", query, re.DOTALL)
    serial_number = query_info[0][0]
    search_term = pre_processing(query_info[0][1])[0]
    search_term_list = []
    if search_term in inverted_index.keys():
        for doc_info_of_search_term in inverted_index[search_term]:
            search_term_list.append(doc_info_of_search_term[0])

    with open("results.boolean.txt", "a") as written_file:
        for docid in search_term_list:
            print("%s,%s" % (serial_number, docid), file=written_file)
Example #8
def main(path, pre, mode):

    petrinet, init, targets = load_petrinet(path)

    red_petrinet = petrinet
    red_init = init

    pre_processing_res = None  # stays None if no pre-processing is run
    if pre:
        time_before = time.time()
        print("**** pre processing ***********")
        red_petrinet, red_init, targets, pre_processing_res = pre_processing(petrinet, init, targets)
        petrinet = red_petrinet
        init = red_init
        time_after = time.time()
        print("pre_processing finished in", time_after - time_before)
        print("*******************************")

    # pre_processing_res: truthy -> some target already coverable, False -> all targets
    # uncoverable, None -> inconclusive, so fall through to the backward search
    if pre_processing_res:
        print("The pre-processing found that one target was coverable")
        print("Unsafe")
        exit(0)
    elif pre_processing_res == False:
        print("The pre-processing found that all targets were uncoverable")
        print("Safe")
        exit(0)

    # now we launch the backward coverability algorithm
    if mode == "qcover":
        result = coverability(petrinet, init, targets, prune=True, max_iter=None)
    elif mode == "limit":
        result = limit_coverability(petrinet, init, targets, prune=True, max_iter=None)
    elif mode == "comp":
        result = comparable_coverability(petrinet, init, targets, prune=True, max_iter=None)
    elif mode == "hfifos":
        result = limit_coverability_exploration_heuristic(petrinet, init, targets, "fifos", prune=True, max_iter=None)
    elif mode == "hstacks":
        result = limit_coverability_exploration_heuristic(petrinet, init, targets, "stacks", prune=True, max_iter=None)
    else:
        print("wrong mode")
        print("choose qcover, limit, comp, hfifos or hstacks")
        exit(2)

    if result is None:
        print('Unknown')
    elif result:
        print('Unsafe')
    else:
        print('Safe')
Example #9
def tfidf_process(inverted_index):
    N = 5000  #total number of documents
    tfidf = {}
    file_info = {}

    xml_content = extract_xml_file("trec.5000.xml")
    for single_file in xml_content:
        # file_info maps docid -> list of tokens of headline + text
        file_tokens = pre_processing(single_file[1]) + pre_processing(
            single_file[2])
        file_info[single_file[0]] = file_tokens

    # calculate tf-idf weights
    for term in inverted_index.keys():
        # document frequency df(t) = len(inverted_index[term])
        idf_part = math.log(N / len(inverted_index[term]), 10)
        tfidf[term] = {}
        for term_doc in inverted_index[term]:
            # term_doc[0]: docid; term_doc[1]: list of positions, so tf(t,d) = len(term_doc[1])
            tf_part = 1 + math.log(len(term_doc[1]), 10)
            # w(t,d) = (1 + log10 tf) * log10(N / df)
            tfidf[term][term_doc[0]] = tf_part * idf_part
    return tfidf
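
The weight computed above is the standard log-tf times idf scheme, w(t,d) = (1 + log10 tf(t,d)) * log10(N / df(t)), with N fixed at 5000. A quick sanity check with hypothetical counts (tf = 3 occurrences in the document, df = 50 documents containing the term):

(1 + log10(3)) * log10(5000 / 50) = 1.4771 * 2 ≈ 2.9542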
Example #10
def tfidf_retrieval(ranked_query, tfidf):

    query_info = re.findall(r"(\d+)\s([\sA-Za-z]+)", ranked_query, re.DOTALL)

    serial_number = query_info[0][0]
    query_content = query_info[0][1]
    query = pre_processing(query_content)

    term_doc_list = []
    # collect every docid that contains at least one of the query terms
    for query_term in query:
        term_doc_list += tfidf[query_term].keys()

    doc_result = list(set(term_doc_list))
    docid_score_list = []
    for docid in doc_result:
        docid_score = 0
        for query_term in query:
            if docid in tfidf[query_term].keys():
                docid_score += tfidf[query_term][docid]
        docid_score_list.append((int(docid), docid_score))
    docid_score_list = sorted(docid_score_list, key=lambda x: (-x[1], x[0]))

    # docid_score_list: [(docid, score), ...] sorted by descending score, then ascending docid
    # write at most the top 150 results, one "serial,docid,score" line each
    with open("results.ranked.txt", "a") as written_file:
        for doc_score in docid_score_list[:150]:  # doc_score[0]: docid; doc_score[1]: tf-idf score
            print("%s,%s,%.4f" %
                  (serial_number, doc_score[0], doc_score[1]),
                  file=written_file)
Example #11
def phrase_boolean_term(query, inverted_index):
    phrase_search_result_list = []
    term_search_result_list = []

    query_info = re.findall(r'(\w+)\s"(\w+)\s(\w+)"\s(.+)\s(\w+)', query,
                            re.DOTALL)
    serial_number = query_info[0][0]
    first_term_of_phrase = pre_processing(query_info[0][1])[0]
    second_term_of_phrase = pre_processing(query_info[0][2])[0]
    operator = query_info[0][3]
    second_term = pre_processing(query_info[0][4])[0]
    # phrase part of the query: collect the docids where the two words appear adjacently
    if first_term_of_phrase in inverted_index.keys(
    ) and second_term_of_phrase in inverted_index.keys():
        for doc_info_of_first_term in inverted_index[
                first_term_of_phrase]:  #doc_info[0] docID; doc_info[1] list of positions
            for doc_info_of_second_term in inverted_index[
                    second_term_of_phrase]:
                if doc_info_of_first_term[0] == doc_info_of_second_term[0]:
                    for position_of_second_term in doc_info_of_second_term[1]:
                        for position_of_first_term in doc_info_of_first_term[
                                1]:
                            if (position_of_second_term -
                                    position_of_first_term) == 1:
                                phrase_search_result_list.append(
                                    doc_info_of_first_term[0])
                                break
                        else:
                            continue
                        break
                    break

    if len(phrase_search_result_list
           ) != 0 and second_term in inverted_index.keys():
        #query result for single term
        for doc_info_of_second_term in inverted_index[second_term]:
            term_search_result_list.append(doc_info_of_second_term[0])

        if operator == 'AND':
            list_of_docid = [
                docid for docid in phrase_search_result_list
                if docid in term_search_result_list
            ]
        if operator == 'OR':
            list_of_docid = list(
                set(phrase_search_result_list + term_search_result_list)
            )  # set() removes duplicate docids
        if operator == 'AND NOT':
            list_of_docid = [
                docid for docid in phrase_search_result_list
                if docid not in term_search_result_list
            ]
        if operator == 'OR NOT':
            all_docid = []
            xml_content = extract_xml_file(
                "trec.5000.xml"
            )  # single_file[0]: docid; single_file[1]: headline; single_file[2]: text
            for single_file in xml_content:
                all_docid.append(single_file[0])
            not_term_docid = [
                x for x in all_docid if x not in term_search_result_list
            ]
            list_of_docid = list(
                set(phrase_search_result_list + not_term_docid))

        list_of_docid = [int(docid) for docid in list_of_docid]
        if len(list_of_docid) != 0:
            with open("results.boolean.txt", "a") as written_file:
                for docid in sorted(list_of_docid):
                    print("%s,%s" % (serial_number, docid), file=written_file)

Example #12
if __name__ == '__main__':
    train = pd.read_json(
        "/Users/soyoungkim/Desktop/python_codes/two-sigma/data/train.json")
    test = pd.read_json(
        "/Users/soyoungkim/Desktop/python_codes/two-sigma/data/test.json")

    start_time = time.time()
    train['interest'] = np.where(
        train['interest_level'] == 'high', 1,
        np.where(train['interest_level'] == 'medium', 2, 3))

    important_features = [
        'bathrooms', 'bedrooms', 'price', 'price_room', 'latitude',
        'longitude', 'nb_images', 'nb_features', 'sentiment', 'nb_description',
        'description_len', 'b_counts', 'm_counts', 'b_count_log', 'm_count_log'
    ]

    numerical_features = preprocessing.pre_processing(train)
    processed_test_data = preprocessing.pre_processing(test)
    print('A set of 15 derived features:{0}\n'.format(important_features))
    preprocessing.classification(numerical_features, processed_test_data,
                                 train['interest'])
    ans = rent_interest_classifier()
    ans_dataframe = pd.DataFrame(ans,
                                 columns=['low', 'medium', 'high'],
                                 index=processed_test_data.index)
    ans_dataframe.to_csv('result.csv', index=False)
    print('--- %s seconds ---' % (time.time() - start_time))
Example #13
File: main.py  Project: czong/Fraud
    with open(rawdata_folder + '/rawdata.pickle', 'rb') as handle:
        inputData = pickle.load(handle)
    print('load the existing raw dataset from pickle file, finished!')

##########################################################################################
####################     preprocessing     #####################
##########################################################################################
targetName = 'fpd'
missingRateHighBound = 0.5
categoryUpLimit = 40
fillna = 'None'  # options: 'mean', 'median', '-999', 'None'
var_threshold = 0
scale_enable = False  # when cross validation, no scale in the preprocessing stage
write_en = True

inputAfterPre = pre_processing(preprocessing_choice, inputData, targetName,
                               missingRateHighBound, categoryUpLimit, fillna,
                               var_threshold, scale_enable, write_en,
                               preprocess_folder)

##########################################################################################
####################     feature selection     #####################
##########################################################################################
ranking_method = 'rf'
featureNum = 400

inputAfterSelect = feature_select(feature_ranking_choice, ranking_method,
                                  inputAfterPre, targetName,
                                  featureRank_folder, featureNum)

##########################################################################################
####################    hyper parameter search     #####################
##########################################################################################
Example #14
File: main.py  Project: czong/Fraud
    with open(rawdata_folder+'/rawdata.pickle','rb') as handle:
        inputData = pickle.load(handle)
    print('load the existing raw dataset from pickle file, finished!')

##########################################################################################
####################     preprocessing     #####################
##########################################################################################
targetName = 'fpd'
missingRateHighBound = 0.5
categoryUpLimit = 40
fillna = 'None'    # options: 'mean', 'median', '-999', 'None'
var_threshold = 0
scale_enable = False  # when cross validation, no scale in the preprocessing stage
write_en = True

inputAfterPre = pre_processing(preprocessing_choice,inputData,targetName,missingRateHighBound,categoryUpLimit,fillna,var_threshold,scale_enable,write_en,preprocess_folder)


##########################################################################################
####################     feature selection     #####################
##########################################################################################
ranking_method = 'rf'
featureNum = 400

inputAfterSelect = feature_select(feature_ranking_choice,ranking_method,inputAfterPre,targetName,featureRank_folder,featureNum)

##########################################################################################
####################    hyper parameter search     #####################
##########################################################################################
classifierList = ['xgb']  # full list of classifiers: ['xgb','gbt','rf','ERT'], more are coming 
maxIter = 100 
Example #15
# def calc_accuracy(X,Y):
#     max_vals, max_indices = torch.max(X, 1)
#     acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
#     return acc
    

if __name__ == "__main__":
    ## when using a GPU
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_text", required = True, help="Input text file")
    parser.add_argument("--output_text" , required= True, help="output text file")
    args = parser.parse_args()

    device = torch.device("cuda:0")

    pre_processing(args.input_text)

    test_tsv_file = 'test.tsv'

    bert_model , vocab = get_pytorch_kobert_model()

    dataset_test = nlp.data.TSVDataset(test_tsv_file, field_indices=[0], num_discard_samples=0)

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    ## Setting parameters
    max_len = 20
    batch_size = 64
    
    # data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)