Python TextFileOperator示例，file.file_operator.TextFileOperator Python示例

示例#1

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

    def run_create_sentences(self, article_path, participle_title_path, sentences_path) :
        articles = self.read_article(article_path)
        titles = self.read_participle_title(participle_title_path)

        # remove duplications
        processor = Unique()
        indexs_unique = [titles[index]['id'] for index in processor.unique( \
            [article['participle_title'] for article in titles])]
        indexs_dict = dict().fromkeys(set(indexs_unique))
        remained_articles = [article for article in articles if article['id'] in indexs_dict]
        print 'remove duplications finished ...'

        # create sentences
        segmentor = ContentSegementor()
        sentences = list()
        length = len(remained_articles) - 1
        for idx, article in enumerate(remained_articles) :
            segmented_content = segmentor.segement(article['content'])
            sentences.extend([[sentence] for sentence in segmented_content])
            if idx % 100 == 0 :
                print 'finish rate is %.2f%%\r' % (100.0*idx/length),
        print 'finish rate is %.2f%%\r' % (100.0*idx/length)
        
        file_operator = TextFileOperator()
        file_operator.writing(sentences, sentences_path)
        print 'writing sentences finished ...'

示例#2

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_feature_select(
     self, article_market_path, pos_path, punc_path, klword_path, feature_path, feature_market_path
 ):
     loader = PickleMarket()
     pos_selector = selector.PosExtractor(pos_path, w=15, combined=False)
     token_selector = selector.TokenExtractor(punc_path)
     word_selector = selector.WordExtractor(klword_path, weight=1)
     articles = loader.load_market(article_market_path)
     length = len(articles) - 1
     for idx, article in enumerate(articles):
         article["features"] = list()
         article["features"].extend(pos_selector.extract_feature_windows(article["participle_content"]))
         article["features"].extend(
             token_selector.extract_feature(
                 article["title"], article["content"], article["participle_title"], article["participle_content"]
             )
         )
         article["features"].extend(
             word_selector.extract_feature(article["participle_title"], article["participle_content"])
         )
         print "finish rate is %.2f%%\r" % (100.0 * idx / length),
     print "finish rate is %.2f%%\r" % (100.0 * idx / length)
     featuresets = [pos_selector.names + token_selector.names + word_selector.names]
     featuresets.extend([[article["id"]] + article["features"] + [article["label"]] for article in articles])
     file_operator = TextFileOperator()
     file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print "finish"

示例#3

0

显示文件

文件： run.py 项目： persistforever/KnowledgableArticle

 def read_article(self, article_path) :
     """ Read source article.
         Each row is an article.
         Colunm[0] is the id of article.
         Column[1:] is the attributes of article.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(article_path)
     entry_list = data_list[0]
     source_list = []
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['url'] = data[1]
             article['title'] = data[2]
             article['content'] = data[3]
             article['participle_title'] = [Word(word) for word in data[4].split(' ')]
             article['participle_content'] = [Word(word) for word in data[5].split(' ')]
             article['label'] = data[6]
             source_list.append(article)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return source_list

示例#4

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_convert_sentences(self, sentences_path, sentences_market_path) :
     file_operator = TextFileOperator()
     sentences = file_operator.reading(sentences_path)
     sentences = [[word.split('<:>')[0] for word in sentence] for sentence in sentences]
     loader = PickleMarket()
     loader.dump_market(sentences, sentences_market_path)
     print 'converting sentences finished ...'

示例#5

0

显示文件

文件： run.py 项目： persistforever/KnowledgableArticle

 def read_participle(self, articles, participle_path) :
     """ Read participle title.
         Each row is an article.
         Colunm[0] is the id of article.
         Column[1] is the word of participle title.
         Column[2] is the word of participle content.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(participle_path)
     entry_list = data_list[0]
     article_dict = dict()
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['participle_title'] = [Word(word) for word in data[1].split(' ')]
             article['participle_content'] = [Word(word) for word in data[2].split(' ')]
             article_dict[article['id']] = article
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         if article['id'] in article_dict :
             article['participle_title'] = article_dict[article['id']]['participle_title']
             article['participle_content'] = article_dict[article['id']]['participle_content']
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return articles

示例#6

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_classify(self, train_path, test_path, train_set, test_set, output_path) :
     loader = PickleMarket()
     # read train
     feature_names = loader.load_market(train_path)[0]
     train_articles = list()
     for type in train_set.split('#') :
         path = train_path.replace(u'car', type)
         train_articles.extend(loader.load_market(path)[1:])
     train_dataset = np.array([np.array(article[1:30000], dtype=float) for article in train_articles])
     print train_dataset.shape
     train_label = np.array([np.array(int(article[-1])) for article in train_articles])
     # read test
     test_articles = list()
     for type in test_set.split('#') :
         path = test_path.replace(u'car', type)
         test_articles.extend(loader.load_market(path)[1:])
     test_dataset = np.array([np.array(article[1:30000]) for article in test_articles])
     print test_dataset.shape
     test_label = np.array([np.array(int(article[-1])) for article in test_articles])
     # train cls
     classifier = LrClassifier()
     train_dataset = classifier.normalize(train_dataset, method='mapminmax')
     test_dataset = classifier.normalize(test_dataset, method='mapminmax')
     classifier.training(train_dataset, train_label, c=10, kernel='linear')
     # test cls
     test_prob = classifier.testing(test_dataset, type='prob')
     test_class = classifier.testing(test_dataset, type='label')
     evls, fprs, tprs = classifier.evaluation(test_label, test_prob, test_class)
     print 'performance is', evls
     ftprs = [[fpr, tprs[idx]] for idx, fpr in enumerate(fprs)]
     file_operator = TextFileOperator()
     file_operator.writing(ftprs, output_path)
     print 'finish'

示例#7

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def read_sentences(self, source_path, type='all') :
     """ Read participle sentences.
         Each row is a sentence.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(source_path)
     entry_list = data_list[0]
     sentences = list()
     length = len(data_list[1:]) - 1
     if type == 'all' :
         for idx, data in enumerate(data_list[1:]) :
             if len(data) >= len(entry_list) :
                 sentence = [Word(word, sp_char=':').to_string() for word in data[0].split(' ')]
                 sentences.append(sentence)
             if idx % 100 == 0 :
                 print 'finish rate is %.2f%%\r' % (100.0*idx/length),
         print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     elif type == 'name' :
         for idx, data in enumerate(data_list[1:]) :
             if len(data) >= len(entry_list) :
                 sentence = [Word(word, sp_char=':').name for word in data[0].split(' ')]
                 sentences.append(sentence)
             if idx % 100 == 0 :
                 print 'finish rate is %.2f%%\r' % (100.0*idx/length),
         print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return sentences

示例#8

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def read_article(self, article_path) :
     """ Read source article.
         Each row is an article.
         Colunm[0] is the id of article.
         Column[1:] is the attributes of article.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(article_path)
     entry_list = data_list[0]
     source_list = []
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['url'] = data[1]
             article['pub_time'] = data[2]
             article['title'] = data[3]
             article['content'] = data[4]
             article['n_zan'] = data[5]
             article['n_forward'] = data[6]
             article['n_click'] = data[7]
             article['n_collect'] = data[8]
             article['read_time'] = data[9]
             article['finish_rate'] = data[10]
             source_list.append(article)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return source_list

示例#9

0

显示文件

文件： selector.py 项目： persistforever/KnowledgableArticle

 def _read_dictionary(self, pos_path):
     """ Build a dict mapping the first column of every non-empty row to 0.

     Rows are read from pos_path with TextFileOperator.reading.
     """
     rows = TextFileOperator().reading(pos_path)
     return dict((row[0], 0) for row in rows if len(row) >= 1)

示例#10

0

显示文件

文件： test.py 项目： persistforever/KnowledgeableSVM

 def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_path, untag_sentence_path) :
     file_operator = TextFileOperator()
     sentences = [u'技能贴 | 黑色打底裤的10种正确穿搭方式', u'春季男鞋韩版潮流行英伦男士休闲鞋']
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:1000])
     print 'finish'

示例#11

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_robot(self, tag_tree_path, sentences_market_path, tags_path) :
     robot = Robot()
     loader = PickleMarket()
     file_operator = TextFileOperator()
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     sentences = loader.load_market(sentences_market_path)
     tags = loader.load_market(tags_path)
     print 'start'
     string = raw_input().decode('gb18030')
     # string = u'我想要毛衣'
     sentences = robot.question_and_answer(string, sentences, tags, tag_tree)

示例#12

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_create_word2vec(self, sentences_path, word_embedding_path, word_embedding_market_path) :
     loader = PickleMarket()
     sentences = list()
     for type in [u'_car']:#, u'_finance', u'_web'] :
         sentences.extend(loader.load_market(sentences_path + type))
     print 'import finish ...'
     embeddor = WordEmbed()
     print sentences[0]
     model = embeddor.word_to_vector(type='create', sentences=sentences[0:100], path=word_embedding_market_path)
     data_list = embeddor.get_word2vec_model(model)
     file_operator = TextFileOperator()
     file_operator.writing(data_list, word_embedding_path)
     print 'create word2vec finished ...'

示例#13

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_martket_path, dict_market_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     loader.dump_market(tags, tags_martket_path)
     loader.dump_market(tag_tree.dict_tuple, dict_market_path)
     print '%.2f%% article >= 1 tags, number is, %d.' \
         % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) \
         % len([tag for tag in tags_show if len(tag) >= 1])

示例#14

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_test(self, tag_tree_path, sentences_market_path, tags_path, \
     tags_martket_path, untag_sentence_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     self.write_tags(sentences, tags_show, tags_path)
     loader.dump_market(tags, tags_martket_path)
     file_operator.writing(untag_sentences, untag_sentence_path)
     # loader.dump_market(untag_sentences, sentences_market_path)
     # print '%.2f%% article >= 2 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
     print '%.2f%% article >= 3 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))

示例#15

0

显示文件

文件： test.py 项目： persistforever/KnowledgeableSVM

 def read_sentences(self, sentences_path) :
     """ Read participle sentences.
         Each row is a sentence.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(sentences_path)
     entry_list = data_list[0]
     sentences = list()
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             sentence = data[0].upper()
             sentences.append(sentence)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return sentences

示例#16

0

显示文件

文件： segementor.py 项目： persistforever/KnowledgeableSVM

 def _read_dictionary(self, split_path):
     """ Build a dict whose keys are the first column of every non-empty row.

     All values are None. The TextFileOperator used for reading is kept on
     self.file_operator.
     """
     self.file_operator = TextFileOperator()
     rows = self.file_operator.reading(split_path)
     return dict.fromkeys([row[0] for row in rows if len(row) >= 1])

示例#17

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def write_article(self, articles, article_path) :
     """ Write source article.
         Each row is an article.
         Colunm[0] is the id of article.
         Column[1:] is the attributes of article.
     """
     data_list = list()
     entry_list = ['id', 'url', 'title', 'content']
     data_list.append(entry_list)
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         data = [article['id'], article['url'], article['title'], article['content']]
         data_list.append(data)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     file_operator = TextFileOperator()
     file_operator.writing(data_list, article_path)

示例#18

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def read_participle_title(self, title_path) :
     """ Read participle title.
         Each row is an article.
         Colunm[0] is the id of article.
         Column[1:] is the word of participle title.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(title_path)
     entry_list = data_list[0]
     source_list = list()
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['participle_title'] = [Word(word) for word in data[1].split(' ')]
             source_list.append(article)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return source_list

示例#19

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def write_tags(self, sentences, tags, tags_path) :
     """ Read participle sentences.
         Each row is a sentence.
         Each column is a <attribute, value> pair.
     """
     file_operator = TextFileOperator()
     data_list = list()
     data_list.append(['sentence', 'tag'])
     length = len(tags) - 1
     for idx, term in enumerate(tags) :
         if len(term) >= 2 :
             data = list()
             data.append(sentences[idx][1])
             tag_str = ''
             for attr, value in term :
                 tag_str += u'<' + attr + u',' + value + u'>' + ' '
             data.append(tag_str)
             data_list.append(data)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     file_operator.writing(data_list, tags_path)

示例#20

0

显示文件

文件： run.py 项目： persistforever/KnowledgableArticle

 def run_feature_select(self, article_market_path, pos_path, punc_path, klword_path, \
     feature_path, feature_market_path) :
     loader = PickleMarket()
     pos_selector = selector.PosExtractor(pos_path, w=5, combined=True)
     token_selector = selector.TokenExtractor(punc_path)
     word_selector = selector.WordExtractor(klword_path, weight=1)
     articles = loader.load_market(article_market_path)
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         article['features'] = list()
         article['features'].extend(pos_selector.extract_feature(article['participle_content']))
         article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                     article['participle_title'], \
                                                                     article['participle_content']))
         article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                     article['participle_content']))
         print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     featuresets = [[article['id']] + article['features'] + [article['label']] for article in articles]
     file_operator = TextFileOperator()
     file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print 'finish'

示例#21

0

显示文件

文件： run.py 项目： persistforever/KnowledgeableSVM

 def run_optimize_params(
     self, train_article_market_path, test_article_market_path, pos_path, punc_path, klword_path, logger_path
 ):
     loader = PickleMarket()
     logger = list()
     logger.append(
         [
             "w",
             "combined",
             "weight",
             "kernel",
             "c",
             "norm",
             "car_car",
             "car_finance",
             "car_web",
             "finance_car",
             "finance_finance",
             "finance_web",
             "web_car",
             "web_fiannce",
             "web_web",
             "merge_car",
             "merge_finance",
             "merge_web",
         ]
     )
     domains = [u"car", u"finance", u"web"]
     wset = [5, 10, 15, 20]
     combinedset = [True, False]
     weightset = [1, 2, 5]
     kernelset = ["linear", "poly", "rbf"]
     cset = [range(10, 100, 10), range(100, 1000, 100)]
     normset = ["mapminmax", "zscore"]
     token_selector = selector.TokenExtractor(punc_path)
     for w in wset:
         for combined in combinedset:
             pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
             for weight in weightset:
                 word_selector = selector.WordExtractor(klword_path, weight=weight)
                 train_featuresets, test_featuresets = list(), list()
                 for step in range(len(domains)):
                     train_featuresets.append(list())
                     test_featuresets.append(list())
                 for index, domain in enumerate(domains):
                     train_articles = loader.load_market(train_article_market_path.replace(u"all", domain))
                     test_articles = loader.load_market(test_article_market_path.replace(u"all", domain))
                     # train
                     length = len(train_articles) - 1
                     for idx, article in enumerate(train_articles):
                         article["features"] = list()
                         article["features"].extend(pos_selector.extract_feature(article["participle_content"]))
                         article["features"].extend(
                             token_selector.extract_feature(
                                 article["title"],
                                 article["content"],
                                 article["participle_title"],
                                 article["participle_content"],
                             )
                         )
                         article["features"].extend(
                             word_selector.extract_feature(
                                 article["participle_title"], article["participle_content"]
                             )
                         )
                         print "finish rate is %.2f%%\r" % (100.0 * idx / length),
                     print "finish rate is %.2f%%\r" % (100.0 * idx / length)
                     train_featuresets[index] = [
                         [article["id"]] + article["features"] + [article["label"]] for article in train_articles
                     ]
                     # test
                     length = len(test_articles) - 1
                     for idx, article in enumerate(test_articles):
                         article["features"] = list()
                         article["features"].extend(pos_selector.extract_feature(article["participle_content"]))
                         article["features"].extend(
                             token_selector.extract_feature(
                                 article["title"],
                                 article["content"],
                                 article["participle_title"],
                                 article["participle_content"],
                             )
                         )
                         article["features"].extend(
                             word_selector.extract_feature(
                                 article["participle_title"], article["participle_content"]
                             )
                         )
                         print "finish rate is %.2f%%\r" % (100.0 * idx / length),
                     print "finish rate is %.2f%%\r" % (100.0 * idx / length)
                     test_featuresets[index] = [
                         [article["id"]] + article["features"] + [article["label"]] for article in test_articles
                     ]
                 for kernel in kernelset:
                     for c in cset:
                         for norm in normset:
                             evl = list()
                             for train_idx in range(0, len(domains)):
                                 for test_idx in range(0, len(domains)):
                                     train_dataset = np.array(
                                         [np.array(article[1:-1]) for article in train_featuresets[train_idx]]
                                     )
                                     train_label = np.array(
                                         [np.array(int(article[-1])) for article in train_featuresets[train_idx]]
                                     )
                                     test_dataset = np.array(
                                         [np.array(article[1:-1]) for article in test_featuresets[test_idx]]
                                     )
                                     test_label = np.array(
                                         [np.array(int(article[-1])) for article in test_featuresets[test_idx]]
                                     )
                                     classifier = SvmClassifier()
                                     train_dataset = classifier.normalize(train_dataset, method=norm)
                                     test_dataset = classifier.normalize(test_dataset, method=norm)
                                     classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                     test_prob = classifier.testing(test_dataset, type="prob")
                                     test_class = classifier.testing(test_dataset, type="label")
                                     evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print "single finished ..."
                             # merge
                             articles = list()
                             for train_idx in range(0, len(domains)):
                                 articles.extend(train_featuresets[train_idx])
                             train_dataset = np.array([np.array(article[1:-1]) for article in articles])
                             train_label = np.array([np.array(int(article[-1])) for article in articles])
                             for test_idx in range(0, len(domains)):
                                 test_dataset = np.array(
                                     [np.array(article[1:-1]) for article in test_featuresets[test_idx]]
                                 )
                                 test_label = np.array(
                                     [np.array(int(article[-1])) for article in test_featuresets[test_idx]]
                                 )
                                 classifier = SvmClassifier()
                                 train_dataset = classifier.normalize(train_dataset, method=norm)
                                 test_dataset = classifier.normalize(test_dataset, method=norm)
                                 classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                 test_prob = classifier.testing(test_dataset, type="prob")
                                 test_class = classifier.testing(test_dataset, type="label")
                                 evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print "merge finished ..."
                             print "performance is", 1.0 * sum(evl) / len(evl)
                             log = [w, combined, weight, kernel, c[0], norm]
                             log.extend(evl)
                             logger.append(log)
     file_operator = TextFileOperator()
     file_operator.writing(logger, logger_path)
     print "finish"

示例#22

0

显示文件

文件： run.py 项目： persistforever/KnowledgableArticle

 def run_optimize_params(self, train_article_market_path, test_article_market_path, \
     pos_path, punc_path, klword_path, logger_path) :
     loader = PickleMarket()
     logger = list()
     logger.append(['w', 'combined', 'weight', 'kernel', 'c', 'norm', 'car_car', \
         'car_finance', 'car_web', 'finance_car', 'finance_finance', 'finance_web', \
         'web_car', 'web_fiannce', 'web_web', 'merge_car', 'merge_finance', 'merge_web'])
     domains = [u'car', u'finance', u'web']
     wset = [5, 10, 15, 20]
     combinedset = [True, False]
     weightset = [1, 2, 5]
     kernelset = ['linear', 'poly', 'rbf']
     cset = [range(10, 100, 10), range(100, 1000, 100)]
     normset = ['mapminmax', 'zscore']
     token_selector = selector.TokenExtractor(punc_path)
     for w in wset :
         for combined in combinedset :
             pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
             for weight in weightset :
                 word_selector = selector.WordExtractor(klword_path, weight=weight)
                 train_featuresets, test_featuresets = list(), list()
                 for step in range(len(domains)) :
                     train_featuresets.append(list())
                     test_featuresets.append(list())
                 for index, domain in enumerate(domains) :
                     train_articles = loader.load_market(train_article_market_path.replace(u'all', domain))
                     test_articles = loader.load_market(test_article_market_path.replace(u'all', domain))
                     # train
                     length = len(train_articles) - 1
                     for idx, article in enumerate(train_articles) :
                         article['features'] = list()
                         article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                         article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                     article['participle_title'], \
                                                                                     article['participle_content']))
                         article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                     article['participle_content']))
                         print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                     train_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in train_articles]
                     # test
                     length = len(test_articles) - 1
                     for idx, article in enumerate(test_articles) :
                         article['features'] = list()
                         article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                         article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                     article['participle_title'], \
                                                                                     article['participle_content']))
                         article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                     article['participle_content']))
                         print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                     test_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in test_articles]
                 for kernel in kernelset :
                     for c in cset :
                         for norm in normset :
                             evl = list()
                             for train_idx in range(0, len(domains)) :
                                 for test_idx in range(0, len(domains)) :
                                     train_dataset = np.array([np.array(article[1:-1]) for article in train_featuresets[train_idx]])
                                     train_label = np.array([np.array(int(article[-1])) for article in train_featuresets[train_idx]])
                                     test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                     test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                     classifier = SvmClassifier()
                                     train_dataset = classifier.normalize(train_dataset, method=norm)
                                     test_dataset = classifier.normalize(test_dataset, method=norm)
                                     classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                     test_prob = classifier.testing(test_dataset, type='prob')
                                     test_class = classifier.testing(test_dataset, type='label')
                                     evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print 'single finished ...'
                             # merge
                             articles = list()
                             for train_idx in range(0, len(domains)) :
                                 articles.extend(train_featuresets[train_idx])
                             train_dataset = np.array([np.array(article[1:-1]) for article in articles])
                             train_label = np.array([np.array(int(article[-1])) for article in articles])
                             for test_idx in range(0, len(domains)) :
                                 test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                 test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                 classifier = SvmClassifier()
                                 train_dataset = classifier.normalize(train_dataset, method=norm)
                                 test_dataset = classifier.normalize(test_dataset, method=norm)
                                 classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                 test_prob = classifier.testing(test_dataset, type='prob')
                                 test_class = classifier.testing(test_dataset, type='label')
                                 evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print 'merge finished ...'
                             print 'performance is', 1.0*sum(evl)/len(evl)
                             log = [w, combined, weight, kernel, c[0], norm]
                             log.extend(evl)
                             logger.append(log)
     file_operator = TextFileOperator()
     file_operator.writing(logger, logger_path)
     print 'finish'