Example #1
 def go(self):
     y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
     for obs_field in self.obs_fields:
         if obs_field not in self.dfAll.columns:
             self.logger.info("Skip %s"%obs_field)
             continue
         obs_corpus = self.dfAll[obs_field].values
         for target_field in self.target_fields:
             if target_field not in self.dfAll.columns:
                 self.logger.info("Skip %s"%target_field)
                 continue
             target_corpus = self.dfAll[target_field].values
             ext = self.generator(obs_corpus, target_corpus, *self.param_list)
             x = ext.transform()
             if isinstance(ext.__name__(), list):
                 for i,feat_name in enumerate(ext.__name__()):
                     dim = 1
                     fname = "%s_%s_x_%s_%dD"%(feat_name, obs_field, target_field, dim)
                     pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                     corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                     self.logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
             else:
                 dim = np_utils._dim(x)
                 fname = "%s_%s_x_%s_%dD"%(ext.__name__(), obs_field, target_field, dim)
                 pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x)
                 if dim == 1:
                     corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                     self.logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                 elif self.force_corr:
                     for j in range(dim):
                         corr = np_utils._corr(x[:TRAIN_SIZE,j], y_train)
                         self.logger.info("%s (%d/%dD): corr = %.6f"%(fname, j+1, dim, corr))
Example #2
def Article2Template(lang="en"):
	print "[%s]: generate article2template dict for language %s" % (time_utils._timestamp(), lang)
	infile = open(config.ARTICLE_TEMPLATES[lang])
	prefix = config.LANG_PREFIX[lang]
	len_prefix = len(prefix)
	articleDict = {}
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		article = row[0][1:-1]
		template = row[2][1:-1]
		article = article[len_prefix:]
		template = template[len_prefix:]

		if "/" in template:
			continue

		if article in articleDict:
			articleDict[article].append(template)
		else:
			articleDict[article] = [template, ]
	print "%d articles in total" % len(articleDict)
	pkl_utils._save(config.ARTICLE2TEMPLATE[lang], articleDict)
	print "[%s]: generation complete" % time_utils._timestamp()
 def convert(self):
     dfAll = pd.read_csv(self.fname)
     columns_to_drop = ["id", "product_uid", "relevance", "search_term", "product_title"]
     columns_to_drop = [col for col in columns_to_drop if col in dfAll.columns]
     dfAll.drop(columns_to_drop, axis=1, inplace=True)
     for col in dfAll.columns:
         pkl_utils._save("%s/TuringTest_%s_%s.pkl"%(config.FEAT_DIR, self.name, col), dfAll[col].values)
Example #4
def main():
    
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")


    # splits for level1
    splitter = HomedepotSplitter(dfTrain=dfTrain, 
                                dfTest=dfTest, 
                                n_iter=config.N_RUNS, 
                                random_state=config.RANDOM_SEED, 
                                verbose=True,
                                plot=True,
                                # tune these params to get a close distribution
                                split_param=[0.5, 0.25, 0.5],
                                )
    splitter.split()
    splitter.save("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level1 = splitter.splits


    ## splits for level2
    splits_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level2 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter2 = HomedepotSplitter(dfTrain=dfValid, 
                                    dfTest=dfTest, 
                                    n_iter=1, 
                                    random_state=run, 
                                    verbose=True,
                                    # tune these params to get a close distribution
                                    split_param=[0.5, 0.15, 0.6])
        splitter2.split()
        splits_level2[run] = splitter2.splits[0]
    pkl_utils._save("%s/splits_level2.pkl"%config.SPLIT_DIR, splits_level2)


    ## splits for level3
    splits_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR)
    splits_level3 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter3 = HomedepotSplitter(dfTrain=dfValid, 
                                    dfTest=dfTest, 
                                    n_iter=1, 
                                    random_state=run, 
                                    verbose=True,
                                    # tune these params to get a close distribution
                                    split_param=[0.5, 0.15, 0.7])
        splitter3.split()
        splits_level3[run] = splitter3.splits[0]
    pkl_utils._save("%s/splits_level3.pkl"%config.SPLIT_DIR, splits_level3)
def main():
    fnames = [
        "TSNE_LSA100_Word_Unigram_Pair_search_term_x_product_title_100D",
        "TSNE_LSA100_Word_Bigram_Pair_search_term_x_product_title_100D",
        "TSNE_LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D",
        "TSNE_LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D",
    ]

    fnames = [os.path.join(config.FEAT_DIR, fname+".csv") for fname in fnames]

    for fname in fnames:
        df = pd.read_csv(fname)
        f = df.values
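        # fname[:-4] strips the trailing ".csv", so the pickle is written alongside
        # the source CSV under the same base name.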
        pkl_utils._save(fname[:-4]+".pkl", f)
 def save(self):
     data_dict = {
         "X_train_basic": self.X_train_basic,
         "y_train_cv": self.y_train_cv,
         "X_train_cv": self.X_train_cv,
         "X_test": self.X_test,                    
         "id_test": self.id_test,
         "splitter_prev": self.splitter_prev,
         "splitter": self.splitter,
         "n_iter": self.n_iter,
         "has_basic": self.has_basic,
     }
     fname = os.path.join(config.FEAT_DIR+"/Combine", self.feature_name+config.FEAT_FILE_SUFFIX)
     pkl_utils._save(fname, data_dict)
     self.logger.info("Save to %s" % fname)
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing    
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
Example #8
 def go(self):
     y_train = self.dfAll["is_duplicate"].values[:TRAIN_SIZE]
     for obs_field in self.obs_fields:
         if obs_field not in self.dfAll.columns:
             self.logger.info("Skip %s" % obs_field)
             continue
         obs_corpus = self.dfAll[obs_field].values
         for target_field in self.target_fields:
             if target_field not in self.dfAll.columns:
                 self.logger.info("Skip %s" % target_field)
                 continue
             target_corpus = self.dfAll[target_field].values
             ext = self.generator(obs_corpus, target_corpus,
                                  *self.param_list)
             x = ext.transform()
             if isinstance(ext.__name__(), list):
                 for i, feat_name in enumerate(ext.__name__()):
                     dim = 1
                     fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field,
                                                 target_field, dim)
                     pkl_utils._save(
                         os.path.join(self.feat_dir,
                                      fname + config.FEAT_FILE_SUFFIX),
                         x[:, i])
                     corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                     self.logger.info("%s (%dD): corr = %.6f" %
                                      (fname, dim, corr))
             else:
                 dim = np_utils._dim(x)
                 fname = "%s_%s_x_%s_%dD" % (ext.__name__(), obs_field,
                                             target_field, dim)
                 pkl_utils._save(
                     os.path.join(self.feat_dir,
                                  fname + config.FEAT_FILE_SUFFIX), x)
                 if dim == 1:
                     corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                     self.logger.info("%s (%dD): corr = %.6f" %
                                      (fname, dim, corr))
                 elif self.force_corr:
                     for j in range(dim):
                         corr = np_utils._corr(x[:TRAIN_SIZE, j], y_train)
                         self.logger.info("%s (%d/%dD): corr = %.6f" %
                                          (fname, j + 1, dim, corr))
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing    
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = np_utils._dim(x)
            fname = "%s_%s_x_%s_%dD"%(ext.__name__(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
Example #10
def main():

    dfTrain = pd.read_csv(config.TRAIN_DATA)
    dfTest = pd.read_csv(config.TEST_DATA)

    # splits for level1
    splitter = QuoraSplitter(dfTrain=dfTrain,
                             dfTest=dfTest,
                             n_iter=config.N_RUNS,
                             random_state=config.RANDOM_SEED,
                             verbose=True)
    splitter.split()
    splitter.save(config.SPLIT_DIR + "/splits_level1.pkl")

    ## splits for level2
    splits_level1 = pkl_utils._load(config.SPLIT_DIR + "/splits_level1.pkl")
    splits_level2 = [0] * config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter2 = QuoraSplitter(dfTrain=dfValid,
                                  dfTest=dfTest,
                                  n_iter=config.N_RUNS,
                                  random_state=run,
                                  verbose=True)
        splitter2.split()
        splits_level2[run] = splitter2.splits[-1]
        pkl_utils._save(config.SPLIT_DIR + "/splits_level2.pkl", splits_level2)

    ## splits for level3
    splits_level2 = pkl_utils._load(config.SPLIT_DIR + "/splits_level2.pkl")
    splits_level3 = [0] * config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter3 = QuoraSplitter(dfTrain=dfValid,
                                  dfTest=dfTest,
                                  n_iter=config.N_RUNS,
                                  random_state=run,
                                  verbose=True)
        splitter3.split()
        splits_level3[run] = splitter3.splits[-1]
        pkl_utils._save(config.SPLIT_DIR + "/splits_level3.pkl", splits_level3)
Example #11
def main():
	print "[%s]: generate ontology hierarchy tree" % (time_utils._timestamp())
	G = g.Graph()
	G.parse(config.ONTOLOGY, format="n3")

	q = '''
PREFIX rr: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?child ?parent
WHERE {
	?child rr:subClassOf ?parent .
}'''
	
	results = G.query(q)
	ontologyDict = {}
	for row in results:
		child = str(row[0])
		parent = str(row[1])
		if parent in ontologyDict:
			ontologyDict[parent].append(child)
		else:
			ontologyDict[parent] = [child,]
	pkl_utils._save(config.ONTOLOGY_TREE, ontologyDict)
	print "[%s]: generation complete" % time_utils._timestamp()
Example #12
def getILL(lang, target):
	print "[%s]: generate ILL dict from language %s to language %s" % (time_utils._timestamp(), lang, target)
	infile = open(config.ILL[lang])
	prefix1 = config.LANG_PREFIX[lang]
	prefix2 = config.LANG_PREFIX[target]
	len1 = len(prefix1)
	len2 = len(prefix2)
	linkDict = {}
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		lang1 = row[0][1:-1]
		lang2 = row[2][1:-1]
		if prefix1 not in lang1:
			continue
		if prefix2 not in lang2:
			continue
		lang1 = lang1[len1:]
		lang2 = lang2[len2:]
		linkDict[lang1] = lang2
	print "%d links in total" % len(linkDict)
	pkl_utils._save(config.ILL_DICT["%s2%s" % (lang, target)], linkDict)
	print "[%s]: generation complete" % time_utils._timestamp()
def main():
    FNAME = "feature_text"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    stop_words = set(stopwords.words('russian'))

    train, test = dl.load_data()

    logger.info("Generating title & description text features ...")
    t0 = time()
    # Generating text features for title
    tfidf_title = TfidfVectorizer(stop_words=stop_words,
                                  max_features=config.MAX_TEXT_FEATURES)
    tfidf_description = TfidfVectorizer(stop_words=stop_words,
                                        max_features=config.MAX_TEXT_FEATURES)

    train['description'] = train['description'].fillna(' ')
    test['description'] = test['description'].fillna(' ')
    train['title'] = train['title'].fillna(' ')
    test['title'] = test['title'].fillna(' ')
    tfidf_title.fit(pd.concat([train['title'], test['title']]))
    tfidf_description.fit(pd.concat([train['description'], test['description']]))

    train_title_tfidf = tfidf_title.transform(train['title'])
    test_title_tfidf = tfidf_title.transform(test['title'])

    train_description_tfidf = tfidf_description.transform(train['description'])
    test_description_tfidf = tfidf_description.transform(test['description'])

    svd_title = TruncatedSVD(n_components=config.SVD_N_COMP,
                             algorithm='arpack')
    svd_title.fit(
        tfidf_title.transform(pd.concat([train['title'], test['title']])))

    svd_description = TruncatedSVD(n_components=config.SVD_N_COMP,
                                   algorithm='arpack')
    svd_description.fit(
        tfidf_description.transform(
            pd.concat([train['description'], test['description']])))

    train_description_svd = pd.DataFrame(
        svd_description.transform(train_description_tfidf))
    test_description_svd = pd.DataFrame(
        svd_description.transform(test_description_tfidf))
    train_description_svd.columns = [
        'svd_description_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]
    test_description_svd.columns = [
        'svd_description_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]

    train_title_svd = pd.DataFrame(svd_title.transform(train_title_tfidf))
    test_title_svd = pd.DataFrame(svd_title.transform(test_title_tfidf))
    train_title_svd.columns = [
        'svd_title_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]
    test_title_svd.columns = [
        'svd_title_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]
    gc.collect()

    logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1))

    logger.info('Train SVD title shape: %s & Test SVD title shape: %s' %
                (train_title_svd.shape, test_title_svd.shape))
    logger.info(
        'Train SVD description shape: %s & Test SVD description shape: %s' %
        (train_description_svd.shape, test_description_svd.shape))

    # save data
    train_fname = os.path.join(config.DATA_FEATURES_DIR,
                               "train_" + FNAME + config.FEAT_FILE_SUFFIX)
    test_fname = os.path.join(config.DATA_FEATURES_DIR,
                              "test_" + FNAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % train_fname)
    pkl_utils._save(
        train_fname, pd.concat([train_title_svd, train_description_svd],
                               axis=1))
    logger.info("Save to %s" % test_fname)
    pkl_utils._save(test_fname,
                    pd.concat([test_title_svd, test_description_svd], axis=1))
    gc.collect()
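
# Equivalent standalone sketch of the per-column text pipeline above (an assumption:
# plain sklearn + pandas with illustrative names, not the project's own helper).
# TF-IDF is fit on the concatenated train+test text, then TruncatedSVD compresses it
# into n_components dense columns per row.
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

def text_svd_features(train_text, test_text, n_components, stop_words=None):
    pipe = make_pipeline(
        TfidfVectorizer(stop_words=stop_words),
        TruncatedSVD(n_components=n_components, algorithm="arpack"),
    )
    # fit on all available text so train and test share one vocabulary and basis
    pipe.fit(pd.concat([train_text, test_text]))
    cols = ["svd_%d" % (i + 1) for i in range(n_components)]
    return (pd.DataFrame(pipe.transform(train_text), columns=cols),
            pd.DataFrame(pipe.transform(test_text), columns=cols))
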
def main():
    logname = "generate_feature_group_distance_stat_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"]

    match_list = [
    "MatchQueryCount",
    "MatchQueryRatio",
    "LongestMatchRatio",
    ]

    tfidf_list = [
    "StatCoocTF_Unigram_Mean", 
    "StatCoocTF_Unigram_Max",
    "StatCoocTF_Unigram_Min",
    # "StatCoocNormTF_Unigram_Mean", 
    # "StatCoocNormTF_Unigram_Max",
    # "StatCoocNormTF_Unigram_Min", 
    "StatCoocTFIDF_Unigram_Mean",
    "StatCoocTFIDF_Unigram_Max",
    "StatCoocTFIDF_Unigram_Min",
    "StatCoocBM25_Unigram_Mean",
    "StatCoocBM25_Unigram_Max",
    "StatCoocBM25_Unigram_Min",
    # "StatCoocTF_Bigram_Mean", 
    # "StatCoocTF_Bigram_Max",
    # "StatCoocTF_Bigram_Min",
    # "StatCoocNormTF_Bigram_Mean", 
    # "StatCoocNormTF_Bigram_Max",
    # "StatCoocNormTF_Bigram_Min",
    # "StatCoocTFIDF_Bigram_Mean",
    # "StatCoocTFIDF_Bigram_Max",
    # "StatCoocTFIDF_Bigram_Min",
    # "StatCoocBM25_Bigram_Mean",
    # "StatCoocBM25_Bigram_Max",
    # "StatCoocBM25_Bigram_Min",
    # "StatCoocTF_Trigram_Mean", 
    # "StatCoocTF_Trigram_Max",
    # "StatCoocTF_Trigram_Min",
    # "StatCoocNormTF_Trigram_Mean", 
    # "StatCoocNormTF_Trigram_Max",
    # "StatCoocNormTF_Trigram_Min", 
    # "StatCoocTFIDF_Trigram_Mean",
    # "StatCoocTFIDF_Trigram_Max",
    # "StatCoocTFIDF_Trigram_Min",
    # "StatCoocBM25_Trigram_Mean",
    # "StatCoocBM25_Trigram_Max",
    # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [    
    "IntersectCount_Unigram", 
    "IntersectRatio_Unigram", 
    # "IntersectCount_Bigram", 
    # "IntersectRatio_Bigram", 
    # "IntersectCount_Trigram", 
    # "IntersectRatio_Trigram", 
    ]
    first_last_ngram_list = [
    "FirstIntersectCount_Unigram", 
    "FirstIntersectRatio_Unigram", 
    "LastIntersectCount_Unigram", 
    "LastIntersectRatio_Unigram",
    # "FirstIntersectCount_Bigram", 
    # "FirstIntersectRatio_Bigram", 
    # "LastIntersectCount_Bigram", 
    # "LastIntersectRatio_Bigram",
    # "FirstIntersectCount_Trigram", 
    # "FirstIntersectRatio_Trigram", 
    # "LastIntersectCount_Trigram", 
    # "LastIntersectRatio_Trigram",
    ]

    cooccurrence_ngram_count_list = [
    "CooccurrenceCount_Unigram", 
    "CooccurrenceRatio_Unigram", 
    # "CooccurrenceCount_Bigram", 
    # "CooccurrenceRatio_Bigram",
    # "CooccurrenceCount_Trigram", 
    # "CooccurrenceRatio_Trigram",
    ]

    ngram_jaccard_list = [
    "JaccardCoef_Unigram", 
    # "JaccardCoef_Bigram", 
    # "JaccardCoef_Trigram", 
    "DiceDistance_Unigram", 
    # "DiceDistance_Bigram", 
    # "DiceDistance_Trigram", 
    ]

    char_dist_sim_list = [
    "CharDistribution_CosineSim",
    "CharDistribution_KL",
    ]

    tfidf_word_ngram_cosinesim_list = [
    "TFIDF_Word_Unigram_CosineSim",
    # "TFIDF_Word_Bigram_CosineSim",
    # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
    # "TFIDF_Char_Bigram_CosineSim",
    # "TFIDF_Char_Trigram_CosineSim",
    "TFIDF_Char_Fourgram_CosineSim",
    # "TFIDF_Char_Fivegram_CosineSim",
    ]

    lsa_word_ngram_cosinesim_list = [
    "LSA100_Word_Unigram_CosineSim",
    # "LSA100_Word_Bigram_CosineSim",
    # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
    # "LSA100_Char_Bigram_CosineSim",
    # "LSA100_Char_Trigram_CosineSim",
    "LSA100_Char_Fourgram_CosineSim",
    # "LSA100_Char_Fivegram_CosineSim",
    ]

    doc2vec_list = [
    "Doc2Vec_Homedepot_D100_CosineSim",
    ]

    word2vec_list = [
    "Word2Vec_N_Similarity",
    "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]

    distance_generator_list = \
    match_list + \
    tfidf_list + \
    intersect_ngram_count_list + \
    first_last_ngram_list + \
    cooccurrence_ngram_count_list + \
    ngram_jaccard_list + \
    tfidf_word_ngram_cosinesim_list + \
    tfidf_char_ngram_cosinesim_list + \
    lsa_word_ngram_cosinesim_list + \
    lsa_char_ngram_cosinesim_list + \
    char_dist_sim_list + \
    word2vec_list + \
    doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term"] )
    target_fields_list.append( ["product_title", "product_title_product_name"] )
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name+"_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s"%(distance_generator, obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name+"_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i,feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD"%(feat_name, dim)
                                    pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                                    corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                                    logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                        except:
                            logger.info("Skip %s"%dist_name)
                            pass
Example #15
def main():
    logname = "generate_feature_group_distance_stat_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = [
        "DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"
    ]

    match_list = [
        "MatchQueryCount",
        "MatchQueryRatio",
        "LongestMatchRatio",
    ]

    tfidf_list = [
        "StatCoocTF_Unigram_Mean",
        "StatCoocTF_Unigram_Max",
        "StatCoocTF_Unigram_Min",
        # "StatCoocNormTF_Unigram_Mean",
        # "StatCoocNormTF_Unigram_Max",
        # "StatCoocNormTF_Unigram_Min",
        "StatCoocTFIDF_Unigram_Mean",
        "StatCoocTFIDF_Unigram_Max",
        "StatCoocTFIDF_Unigram_Min",
        "StatCoocBM25_Unigram_Mean",
        "StatCoocBM25_Unigram_Max",
        "StatCoocBM25_Unigram_Min",
        # "StatCoocTF_Bigram_Mean",
        # "StatCoocTF_Bigram_Max",
        # "StatCoocTF_Bigram_Min",
        # "StatCoocNormTF_Bigram_Mean",
        # "StatCoocNormTF_Bigram_Max",
        # "StatCoocNormTF_Bigram_Min",
        # "StatCoocTFIDF_Bigram_Mean",
        # "StatCoocTFIDF_Bigram_Max",
        # "StatCoocTFIDF_Bigram_Min",
        # "StatCoocBM25_Bigram_Mean",
        # "StatCoocBM25_Bigram_Max",
        # "StatCoocBM25_Bigram_Min",
        # "StatCoocTF_Trigram_Mean",
        # "StatCoocTF_Trigram_Max",
        # "StatCoocTF_Trigram_Min",
        # "StatCoocNormTF_Trigram_Mean",
        # "StatCoocNormTF_Trigram_Max",
        # "StatCoocNormTF_Trigram_Min",
        # "StatCoocTFIDF_Trigram_Mean",
        # "StatCoocTFIDF_Trigram_Max",
        # "StatCoocTFIDF_Trigram_Min",
        # "StatCoocBM25_Trigram_Mean",
        # "StatCoocBM25_Trigram_Max",
        # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [
        "IntersectCount_Unigram",
        "IntersectRatio_Unigram",
        # "IntersectCount_Bigram",
        # "IntersectRatio_Bigram",
        # "IntersectCount_Trigram",
        # "IntersectRatio_Trigram",
    ]
    first_last_ngram_list = [
        "FirstIntersectCount_Unigram",
        "FirstIntersectRatio_Unigram",
        "LastIntersectCount_Unigram",
        "LastIntersectRatio_Unigram",
        # "FirstIntersectCount_Bigram",
        # "FirstIntersectRatio_Bigram",
        # "LastIntersectCount_Bigram",
        # "LastIntersectRatio_Bigram",
        # "FirstIntersectCount_Trigram",
        # "FirstIntersectRatio_Trigram",
        # "LastIntersectCount_Trigram",
        # "LastIntersectRatio_Trigram",
    ]

    cooccurrence_ngram_count_list = [
        "CooccurrenceCount_Unigram",
        "CooccurrenceRatio_Unigram",
        # "CooccurrenceCount_Bigram",
        # "CooccurrenceRatio_Bigram",
        # "CooccurrenceCount_Trigram",
        # "CooccurrenceRatio_Trigram",
    ]

    ngram_jaccard_list = [
        "JaccardCoef_Unigram",
        # "JaccardCoef_Bigram",
        # "JaccardCoef_Trigram",
        "DiceDistance_Unigram",
        # "DiceDistance_Bigram",
        # "DiceDistance_Trigram",
    ]

    char_dist_sim_list = [
        "CharDistribution_CosineSim",
        "CharDistribution_KL",
    ]

    tfidf_word_ngram_cosinesim_list = [
        "TFIDF_Word_Unigram_CosineSim",
        # "TFIDF_Word_Bigram_CosineSim",
        # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
        # "TFIDF_Char_Bigram_CosineSim",
        # "TFIDF_Char_Trigram_CosineSim",
        "TFIDF_Char_Fourgram_CosineSim",
        # "TFIDF_Char_Fivegram_CosineSim",
    ]

    lsa_word_ngram_cosinesim_list = [
        "LSA100_Word_Unigram_CosineSim",
        # "LSA100_Word_Bigram_CosineSim",
        # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
        # "LSA100_Char_Bigram_CosineSim",
        # "LSA100_Char_Trigram_CosineSim",
        "LSA100_Char_Fourgram_CosineSim",
        # "LSA100_Char_Fivegram_CosineSim",
    ]

    doc2vec_list = [
        "Doc2Vec_Homedepot_D100_CosineSim",
    ]

    word2vec_list = [
        "Word2Vec_N_Similarity",
        "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]

    distance_generator_list = \
    match_list + \
    tfidf_list + \
    intersect_ngram_count_list + \
    first_last_ngram_list + \
    cooccurrence_ngram_count_list + \
    ngram_jaccard_list + \
    tfidf_word_ngram_cosinesim_list + \
    tfidf_char_ngram_cosinesim_list + \
    lsa_word_ngram_cosinesim_list + \
    lsa_char_ngram_cosinesim_list + \
    char_dist_sim_list + \
    word2vec_list + \
    doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term"])
    target_fields_list.append(["product_title", "product_title_product_name"])
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(
            os.path.join(config.FEAT_DIR, group_id_name + "_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list,
                                                 target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s" % (distance_generator,
                                                    obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(
                                os.path.join(config.FEAT_DIR,
                                             dist_name + "_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list,
                                                    dist_name, group_id_name,
                                                    aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i, feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD" % (feat_name, dim)
                                    pkl_utils._save(
                                        os.path.join(
                                            config.FEAT_DIR,
                                            fname + config.FEAT_FILE_SUFFIX),
                                        x[:, i])
                                    corr = np_utils._corr(
                                        x[:TRAIN_SIZE, i], y_train)
                                    logger.info("%s (%dD): corr = %.6f" %
                                                (fname, dim, corr))
                        except:
                            logger.info("Skip %s" % dist_name)
                            pass
def parse(lang="en"):
	_log.info("starting parsing")
	infile = open(config.INSTANCE_TYPES[lang])
	rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
	type_entries = []
	entitySet = set()
	typeSet = set()
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		instance = row[0][1:-1]
		ontology = row[2][1:-1]
		type_entries.append((instance, ontology))
		entitySet.add(ontology)
		typeSet.add(ontology)
	typeDict = {y:x for x, y in enumerate(typeSet)}
	infile.close()

	cnt_type = len(entitySet)
	_log.info("%d types" % cnt_type)

	infile = open(config.OBJECTS[lang])
	relationDict = {}
	instanceSet = set()
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		subject = row[0][1:-1]
		predicate = row[1][1:-1]
		target = row[2][1:-1]
		entitySet.add(subject)
		entitySet.add(target)
		instanceSet.add(subject)
		instanceSet.add(target)
		if predicate in relationDict:
			relationDict[predicate].append((subject, target))
		else:
			relationDict[predicate] = [(subject, target)]
	instanceDict = {y:x for x, y in enumerate(instanceSet)}
	entityDict = {y:x for x, y in enumerate(entitySet)}
	infile.close()

	cnt_ins = len(instanceSet)
	N = len(entitySet)
	_log.info("%d instanes" % cnt_ins)
	_log.info("%d entites" % N)
	
	tensor = []
	predicateDict = {}
	cnt = 0
	for predicate in relationDict:
		entries = relationDict[predicate]
		rows = [entityDict[entry[0]] for entry in entries]
		cols = [entityDict[entry[1]] for entry in entries]
		data = [1 for entry in entries]
		mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
		tensor.append(mat)
		predicateDict[predicate] = cnt
		cnt += 1
	type_entries = [entry for entry in type_entries if entry[0] in instanceSet]
	rows = [entityDict[entry[0]] for entry in type_entries]
	cols = [entityDict[entry[1]] for entry in type_entries]
	data = [1 for entry in type_entries]
	mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
	tensor.append(mat)
	predicateDict[rdf_type] = cnt
	_log.info("%d relations" % (cnt+1))
	pkl_utils._save(config.TENSOR[lang], tensor)
	pkl_utils._save(config.ENTITY[lang], entityDict)
	pkl_utils._save(config.PREDICATE[lang], predicateDict)
	pkl_utils._save(config.INSTANCE[lang], instanceDict)
	pkl_utils._save(config.TYPE[lang], typeDict)
	pkl_utils._save(config.TYPE_MATRIX[lang], (rows, cols))
	_log.info("parsing complete")
def main():
    # load provided data
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")
    dfAttr = pd.read_csv(config.ATTR_DATA)
    dfDesc = pd.read_csv(config.DESC_DATA)

    #
    print("Train Mean: %.6f" % np.mean(dfTrain["relevance"]))
    print("Train Var: %.6f" % np.var(dfTrain["relevance"]))

    #
    dfTest["relevance"] = np.zeros((config.TEST_SIZE))
    dfAttr.dropna(how="all", inplace=True)
    dfAttr["value"] = dfAttr["value"].astype(str)

    # concat train and test
    dfAll = pd.concat((dfTrain, dfTest), ignore_index=True)
    del dfTrain
    del dfTest
    gc.collect()

    # merge product description
    dfAll = pd.merge(dfAll, dfDesc, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfDesc
    gc.collect()

    # merge product brand
    dfBrand = dfAttr[dfAttr.name == "MFG Brand Name"][[
        "product_uid", "value"
    ]].rename(columns={"value": "product_brand"})
    dfAll = pd.merge(dfAll, dfBrand, on="product_uid", how="left")
    dfBrand["product_brand"] = dfBrand["product_brand"].values.astype(str)
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfBrand
    gc.collect()

    # merge product color
    color_columns = [
        "product_color", "Color Family", "Color/Finish", "Color/Finish Family"
    ]
    dfColor = dfAttr[dfAttr.name.isin(color_columns)][[
        "product_uid", "value"
    ]].rename(columns={"value": "product_color"})
    dfColor.dropna(how="all", inplace=True)
    _agg_color = lambda df: " ".join(list(set(df["product_color"])))
    dfColor = dfColor.groupby("product_uid").apply(_agg_color)
    dfColor = dfColor.reset_index(name="product_color")
    dfColor["product_color"] = dfColor["product_color"].values.astype(str)
    dfAll = pd.merge(dfAll, dfColor, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfColor
    gc.collect()

    # merge product attribute
    _agg_attr = lambda df: config.ATTR_SEPARATOR.join(df[
        "name"] + config.ATTR_SEPARATOR + df["value"])
    dfAttr = dfAttr.groupby("product_uid").apply(_agg_attr)
    dfAttr = dfAttr.reset_index(name="product_attribute_concat")
    dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfAttr
    gc.collect()

    # save data
    if config.TASK == "sample":
        dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy()
    pkl_utils._save(config.ALL_DATA_RAW, dfAll)

    # info
    dfInfo = dfAll[["id", "relevance"]].copy()
    pkl_utils._save(config.INFO_DATA, dfInfo)
Example #18
def main():

    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log" % now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # question1 and question2 are the only text columns to clean here
    columns_to_proc = [
        "question1",
        "question2",
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        UnicodeConverter(),
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(),
        # WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser="html.parser"),
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"),
        Stemmer(stemmer_type="porter")
    ][0:1]
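    # [0:1] keeps only the snowball stemmer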

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # save data
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED)
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll)

    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # save data
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED_STEMMED)
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll)
Example #19
def main():
    # load provided data
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")
    dfAttr = pd.read_csv(config.ATTR_DATA)
    dfDesc = pd.read_csv(config.DESC_DATA)

    # 
    print("Train Mean: %.6f"%np.mean(dfTrain["relevance"]))
    print("Train Var: %.6f"%np.var(dfTrain["relevance"]))

    #
    dfTest["relevance"] = np.zeros((config.TEST_SIZE))
    dfAttr.dropna(how="all", inplace=True)
    dfAttr["value"] = dfAttr["value"].astype(str)

    # concat train and test
    dfAll = pd.concat((dfTrain, dfTest), ignore_index=True)
    del dfTrain
    del dfTest
    gc.collect()

    # merge product description
    dfAll = pd.merge(dfAll, dfDesc, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfDesc
    gc.collect()

    # merge product brand
    dfBrand = dfAttr[dfAttr.name=="MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "product_brand"})
    dfAll = pd.merge(dfAll, dfBrand, on="product_uid", how="left")
    dfBrand["product_brand"] = dfBrand["product_brand"].values.astype(str)
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfBrand
    gc.collect()

    # merge product color
    color_columns = ["product_color", "Color Family", "Color/Finish", "Color/Finish Family"]
    dfColor = dfAttr[dfAttr.name.isin(color_columns)][["product_uid", "value"]].rename(columns={"value": "product_color"})
    dfColor.dropna(how="all", inplace=True)
    _agg_color = lambda df: " ".join(list(set(df["product_color"])))
    dfColor = dfColor.groupby("product_uid").apply(_agg_color)
    dfColor = dfColor.reset_index(name="product_color")
    dfColor["product_color"] = dfColor["product_color"].values.astype(str)
    dfAll = pd.merge(dfAll, dfColor, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfColor
    gc.collect()

    # merge product attribute
    _agg_attr = lambda df: config.ATTR_SEPARATOR.join(df["name"] + config.ATTR_SEPARATOR + df["value"])
    dfAttr = dfAttr.groupby("product_uid").apply(_agg_attr)
    dfAttr = dfAttr.reset_index(name="product_attribute_concat")
    dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfAttr
    gc.collect()
    
    # save data
    if config.TASK == "sample":
        dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy()
    pkl_utils._save(config.ALL_DATA_RAW, dfAll)

    # info
    dfInfo = dfAll[["id","relevance"]].copy()
    pkl_utils._save(config.INFO_DATA, dfInfo)
Example #20
def group(input_data, output_data, if_sample=False):
    df = pd.read_csv(input_data,
                     sep="\t",
                     names=["r", "e1", "x1", "y1", "e2", "x2", "y2", "s"])
    grouped = df.groupby(["r", "e1", "e2"])
    words = []
    positions = []
    heads = []
    tails = []
    labels = []
    cnt = 0
    for name, group in grouped:
        if if_sample and cnt > 10000:
            break
        cnt += 1
        if cnt % 1000 == 0:
            print(cnt)
        group = group.reset_index(drop=True)
        label = name[0]
        head = name[1]
        tail = name[2]
        size = group.shape[0]
        tmp_words = []
        tmp_positions = []
        for i in range(size):
            tmp_words.append(group.s[i])
            tmp_positions.append(
                [group.x1[i], group.y1[i], group.x2[i], group.y2[i]])
        if size < config.BAG_SIZE:
            tmp = size
            ans_words = tmp_words[:]
            ans_positions = tmp_positions[:]
            while tmp + size < config.BAG_SIZE:
                tmp += size
                ans_words += tmp_words
                ans_positions += tmp_positions
            ans_words += tmp_words[:config.BAG_SIZE - tmp]
            ans_positions += tmp_positions[:config.BAG_SIZE - tmp]
            words.append(ans_words)
            positions.append(ans_positions)
            heads.append(head)
            tails.append(tail)
            labels.append(label)
        else:
            tmp = 0
            while tmp + config.BAG_SIZE < size:
                words.append(tmp_words[tmp:tmp + config.BAG_SIZE])
                positions.append(tmp_positions[tmp:tmp + config.BAG_SIZE])
                heads.append(head)
                tails.append(tail)
                labels.append(label)
                tmp += config.BAG_SIZE
            words.append(tmp_words[-config.BAG_SIZE:])
            positions.append(tmp_positions[-config.BAG_SIZE:])
            heads.append(head)
            tails.append(tail)
            labels.append(label)
    heads = np.array(heads)
    tails = np.array(tails)
    labels = np.array(labels)
    pkl_utils._save(output_data, (words, positions, heads, tails, labels))
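# Bag-padding note (walked through on toy data, not from the source): with
# BAG_SIZE = 5 and a group of 2 sentences ["a", "b"], the loop above repeats the
# bag to ["a", "b", "a", "b"] and then tops it up to ["a", "b", "a", "b", "a"],
# so every emitted bag has exactly BAG_SIZE entries.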
Example #21
 def save(self, model_dir, model_name):
     fname = os.path.join(model_dir, model_name)
     self.model.save(fname)
     pkl_utils._save("%s.sent_label" % fname, self.sentences.sent_label)
Example #22
def parse(lang="en"):
	dataDict = {}
	
	infile = open(config.INSTANCE_TYPES[lang])
	rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
	type_entries = []
	entitySet = set()
	typeSet = set()
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		instance = row[0][1:-1]
		ontology = row[2][1:-1]
		type_entries.append((instance, ontology))
		entitySet.add(ontology)
		typeSet.add(ontology)
	typeDict = {y:x for x, y in enumerate(typeSet)}
	infile.close()

	cnt_type = len(entitySet)
	_log.info("%d types" % cnt_type)

	infile = open(config.OBJECTS[lang])
	relationDict = {}
	instanceSet = set()
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		subject = row[0][1:-1]
		predicate = row[1][1:-1]
		target = row[2][1:-1]
		entitySet.add(subject)
		entitySet.add(target)
		instanceSet.add(subject)
		instanceSet.add(target)
		if predicate in relationDict:
			relationDict[predicate].append((subject, target))
		else:
			relationDict[predicate] = [(subject, target)]
	instanceDict = {y:x for x, y in enumerate(instanceSet)}
	entityDict = {y:x for x, y in enumerate(entitySet)}
	infile.close()

	cnt_ins = len(instanceSet)
	N = len(entitySet)
	_log.info("%d instanes" % cnt_ins)
	_log.info("%d entites" % N)
	
	triples = []
	predicateDict = {}
	cnt = 0
	for predicate in relationDict:
		entries = relationDict[predicate]
		sub = [entityDict[entry[0]] for entry in entries]
		obj = [entityDict[entry[1]] for entry in entries]
		pred = [cnt for entry in entries]
		triples.extend(zip(sub, obj, pred))
		predicateDict[cnt] = predicate
		cnt += 1
	type_entries = [entry for entry in type_entries if entry[0] in instanceSet]
	sub = [entityDict[entry[0]] for entry in type_entries]
	obj = [entityDict[entry[1]] for entry in type_entries]
	pred = [cnt for entry in type_entries]
	triples.extend(zip(sub, obj, pred))
	predicateDict[cnt] = rdf_type
	triples = pd.Series(triples)
	_log.info("%d relations" % (cnt+1))
	_log.info("%d triples" % len(triples))

	dataDict["entities"] = list(entitySet)
	dataDict["relations"] = predicateDict.values()
	IDX = list(range(len(triples)))
	shuffle(IDX)
	dataDict["train_subs"] = list(triples[IDX[:-20000]])
	dataDict["valid_subs"] = list(triples[IDX[-20000:-10000]])
	dataDict["test_subs"] = list(triples[IDX[-10000:]])
	pkl_utils._save(config.DATA_DICT[lang], dataDict)

	_log.info("train size: %d" % len(dataDict["train_subs"]))
	_log.info("valid size: %d" % len(dataDict["valid_subs"]))
	_log.info("test size: %d" % len(dataDict["test_subs"]))

	_log.info("parsing complete")
dfAttr = dfAttr.reset_index(name="product_attribute_concat")
dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left")
dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
del dfAttr
gc.collect()

# In[17]:

dfAll.head()

# In[ ]:

# save data
if config.TASK == 'sample':
    dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy()  # in this case ".copy" is redundant
pkl_utils._save(config.ALL_DATA_RAW, dfAll)

# info
dfInfo = dfAll[['id', 'relevance']].copy()
pkl_utils._save(config.INFO_DATA, dfInfo)

# In[ ]:

if os.path.isfile('data_preparer.ipynb'):
    get_ipython().system('jupyter nbconvert --to script data_preparer.ipynb')

Example #24
def main():

    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form 
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand", 
        "product_color",
        "product_title",
        "search_term", 
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(), 
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(), 
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA), 
        LetterLetterSplitter(),
        DigitLetterSplitter(), 
        DigitCommaDigitMerger(), 
        NumberDigitMapper(),
        UnitConverter(), 
        QuartetCleaner(), 
        HtmlCleaner(parser="html.parser"), 
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"), 
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]


    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])


    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    if config.GOOGLE_CORRECTING_QUERY:
        logger.info("Run GoogleQuerySpellingChecker at search_term")
        checker = GoogleQuerySpellingChecker()
        dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)


    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])
    # query expansion
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(processors)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
        dfAll["search_term_alt"] = qe.build()
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll["search_term_auto_corrected"] = list(dfAll["search_term"].apply(checker.correct))
        columns_to_proc += ["search_term_auto_corrected"]
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    # query expansion
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(stemmers)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
        dfAll["search_term_alt"] = qe.build()
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
def main():
    ### 1. Record Time
    now = time_utils._timestamp()
    ###########
    ## Setup ##
    ###########
    logname = f'data_processor_{now}.log'
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process.
    # Choose the columns by checking data_preparer.ipynb; the end of that notebook shows the cleaned data frame.
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # 'product_attribute_list',
        'product_attribute_concat',
        'product_description',
        'product_brand',
        'product_color',
        'product_title',
        'search_term',
    ]
    if config.PLATFORM == 'Linux':
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        # It actually makes little difference, unless we could keep a number followed by the preposition "in" from being converted into the unit in. (inch)
        UnitConverter(),
        LowerUpperCaseSplitter(),
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser='html.parser'),
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type='snowball'),
        Stemmer(stemmer_type='porter')
    ][0:1]  # means only use Stemmer(stemmer_type='snowball')

    ## simple test
    text = '1/2 inch rubber lep tips Bullet07'
    print('Original:')
    print(text)
    list_processor = ListProcessor(processors)
    print('After:')
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    if config.TASK == 'sample':
        dfAll = dfAll.iloc[0:config.SAMPLE_SIZE]
        print(f'data length: {len(dfAll)}')

    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll['search_term_product_name'] = dfAll['search_term'].apply(
        ext.transform)
    dfAll['product_title_product_name'] = dfAll['product_title'].apply(
        ext.transform)
    if config.TASK == 'sample':
        print(dfAll[[
            'search_term', 'search_term_product_name',
            'product_title_product_name'
        ]])

    ## clean using GoogleQuerySpellingChecker (not used in the Chenglong team's final submission)
    # MUST BE IN FRONT OF ALL THE PROCESSING
    if config.GOOGLE_CORRECTING_QUERY:
        logger.info('Run GoogleQuerySpellingChecker at search_term')
        checker = GoogleQuerySpellingChecker()
        dfAll['search_term'] = dfAll['search_term'].apply(checker.correct)

    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_text)
    dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_list)
    if config.TASK == 'sample':
        print(dfAll[['product_attribute', 'product_attribute_list']])

    # query expansion (the Chenglong team removed this feature, as it might be a major cause of overfitting)
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(processors)
        # Stop words must go through the same processing as the data. E.g. NumberDigitMapper
        # replaces 'one' with '1', so a stop word 'one' must become '1' as well.
        base_stopwords = set(list_processor.process(list(
            config.STOP_WORDS)))  # set of processed stop words
        qe = QueryExpansion(dfAll,
                            ngram=3,
                            stopwords_threshold=0.9,
                            base_stopwords=base_stopwords)
        dfAll['search_term_alt'] = qe.build()
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_alt']])

    # save data
    logger.info(f'Save to {config.ALL_DATA_LEMMATIZED}')
    columns_to_save = [
        col for col in dfAll.columns if col != 'product_attribute_concat'
    ]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## auto correcting query (not used in the Chenglong team's final submission)
    if config.AUTO_CORRECTING_QUERY:
        logger.info('Run AutoSpellingChecker at search_term')
        checker = AutoSpellingChecker(dfAll,
                                      exclude_stopwords=False,
                                      min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll['search_term'].apply(
            checker.correct))
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_auto_corrected']])
        # save query_correction_map and spelling checker
        fname = '%s/auto_spelling_checker_query_correction_map_%s.log' % (
            config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED)
        columns_to_save = [
            col for col in dfAll.columns if col != 'product_attribute_concat'
        ]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_text)
    dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_list)

    # query expansion
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(stemmers)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll,
                            ngram=3,
                            stopwords_threshold=0.9,
                            base_stopwords=base_stopwords)
        dfAll['search_term_alt'] = qe.build()
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_alt']])

    # save data
    logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [
        col for col in dfAll.columns if col != 'product_attribute_concat'
    ]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
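
# _split_attr_to_text and _split_attr_to_list are referenced above but not defined in this
# snippet. The sketch below is a minimal, hypothetical version; it assumes the concatenated
# attribute string joins names and values with ' | ', matching the
# "attr_name1 | attr_value1 | attr_name2 | attr_value2 | ..." form described in the
# columns_to_proc comments. The project's real helpers may use a different separator.
def _split_attr_to_text(text):
    # flatten all attribute names and values into one space-separated text field
    return ' '.join(text.split(' | '))

def _split_attr_to_list(text):
    # pair consecutive tokens as (attr_name, attr_value) tuples
    attrs = text.split(' | ')
    return [(attrs[i], attrs[i + 1]) for i in range(0, len(attrs) - 1, 2)]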
 def save(self, model_dir, model_name):
     fname = os.path.join(model_dir, model_name)
     self.model.save(fname)
     pkl_utils._save("%s.sent_label"%fname, self.sentences.sent_label)
示例#27
0
def main():

    ###########
    ## Setup ##
    ###########
    now = time_utils._timestamp()
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form 
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand", 
        "product_color",
        "product_title",
        "search_term", 
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(), 
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(), 
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA), 
        LetterLetterSplitter(),
        DigitLetterSplitter(), 
        DigitCommaDigitMerger(), 
        NumberDigitMapper(),
        UnitConverter(), 
        QuartetCleaner(), 
        HtmlCleaner(parser="html.parser"), 
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"), 
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]


    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])


    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    logger.info("Run GoogleQuerySpellingChecker at search_term")
    checker = GoogleQuerySpellingChecker()
    dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)


    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])
    # query expansion
    list_processor = ListProcessor(processors)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll["search_term"].apply(checker.correct))
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    # query expansion
    list_processor = ListProcessor(stemmers)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
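
# DataFrameParallelProcessor is not defined in this snippet. Conceptually it runs the given
# list of processors over every selected column, roughly one worker per column (which is why
# DATA_PROCESSOR_N_JOBS is set to len(columns_to_proc) on Linux above). The class below is a
# simplified, hypothetical sketch built on multiprocessing and the ListProcessor helper used
# earlier; it assumes the processor objects are picklable, and the project's real
# implementation may differ in API and parallel backend.
from multiprocessing import Pool

def _process_one_column(args):
    # worker: run the full processor chain over a single column's values
    processors, values = args
    return ListProcessor(processors).process(list(values))

class SimpleParallelProcessor(object):
    def __init__(self, processors, n_jobs=4):
        self.processors = processors
        self.n_jobs = n_jobs

    def process(self, dfAll, columns):
        tasks = [(self.processors, dfAll[col].values) for col in columns]
        with Pool(self.n_jobs) as pool:
            results = pool.map(_process_one_column, tasks)
        # write the cleaned values back into the data frame, column by column
        for col, processed in zip(columns, results):
            dfAll[col] = processed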
示例#28
0
 def save(self, fname):
     pkl_utils._save(fname, self.splits)
示例#29
0
 def save(self, fname):
     pkl_utils._save(fname, self.splits)
示例#30
0
def parse(lang="en"):
	infile = open(config.INSTANCE_TYPES[lang])
	rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
	type_entries = []
	entitySet = set()
	typeSet = set()
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		instance = row[0][1:-1]
		ontology = row[2][1:-1]
		type_entries.append((instance, ontology))
		entitySet.add(ontology)
		typeSet.add(ontology)
	typeDict = {y:x for x, y in enumerate(typeSet)}
	infile.close()

	cnt_type = len(entitySet)
	_log.info("%d types" % cnt_type)

	infile = open(config.OBJECTS[lang])
	relationDict = {}
	instanceSet = set()
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		subject = row[0][1:-1]
		predicate = row[1][1:-1]
		target = row[2][1:-1]
		entitySet.add(subject)
		entitySet.add(target)
		instanceSet.add(subject)
		instanceSet.add(target)
		if predicate in relationDict:
			relationDict[predicate].append((subject, target))
		else:
			relationDict[predicate] = [(subject, target)]
	instanceDict = {y:x for x, y in enumerate(instanceSet)}
	entityDict = {y:x for x, y in enumerate(entitySet)}
	infile.close()

	cnt_ins = len(instanceSet)
	N = len(entitySet)
	_log.info("%d instanes" % cnt_ins)
	_log.info("%d entites" % N)
	
	tensor = []
	predicateDict = {}
	cnt = 0
	for predicate in relationDict:
		entries = relationDict[predicate]
		rows = [entityDict[entry[0]] for entry in entries]
		cols = [entityDict[entry[1]] for entry in entries]
		data = [1 for entry in entries]
		mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
		tensor.append(mat)
		predicateDict[cnt] = predicate
		cnt += 1
	type_entries = [entry for entry in type_entries if entry[0] in instanceSet]
	rows = [entityDict[entry[0]] for entry in type_entries]
	cols = [entityDict[entry[1]] for entry in type_entries]
	data = [1 for entry in type_entries]
	mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
	tensor.append(mat)
	predicateDict[cnt] = rdf_type
	_log.info("%d relations" % (cnt+1))
	pkl_utils._save(config.TENSOR[lang], tensor)
	pkl_utils._save(config.ENTITY[lang], entityDict)
	pkl_utils._save(config.PREDICATE[lang], predicateDict)
	pkl_utils._save(config.INSTANCE[lang], instanceDict)
	pkl_utils._save(config.TYPE[lang], typeDict)
	pkl_utils._save(config.TYPE_MATRIX[lang], (rows, cols))
	_log.info("parsing complete")