Example #1
 def __data_generation(self, indices):
     x = np.empty((self.batch_size, *self.img_size))
     if self.labels is not None:  # training phase
         if self.n_classes == 2:
             y = np.empty((self.batch_size, ), dtype=np.float32)
         else:
             y = np.empty((self.batch_size, self.n_classes),
                          dtype=np.float32)
         for i, idx in enumerate(indices):
             image = Preprocessor.preprocess(self.img_dir +
                                             self.list_ids[idx] + ".dcm")
             if self.labels.iloc[idx]['any'] == 1:
                 image = self.augment_funcs[random.randint(
                     0, self.n_augment)](image)
             image = np.array(image)
             image = np.repeat(image[..., np.newaxis], 3, -1)
             x[i, ] = image
             if self.n_classes == 2:
                 y[i, ] = self.labels.iloc[idx]['any']
             elif self.n_classes == 5:
                 y[i, ] = self.labels.iloc[idx, 1:]
             else:
                 y[i, ] = self.labels.iloc[idx]
         return x, y
     else:  # test phase
         for i, idx in enumerate(indices):
             image = Preprocessor.preprocess(self.img_dir +
                                             self.list_ids[idx] + ".dcm")
             image = np.repeat(image[..., np.newaxis], 3, -1)
             x[i, ] = image
         return x
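The single-channel image is copied onto three identical channels with np.repeat, presumably so it matches the three-channel input expected by pretrained backbones. A minimal, self-contained sketch of that step (the array shape is illustrative):

import numpy as np

gray = np.random.rand(512, 512)                 # stand-in for one preprocessed slice
rgb = np.repeat(gray[..., np.newaxis], 3, -1)   # add a last axis, then repeat it 3 times
print(rgb.shape)                                # (512, 512, 3)
print(np.allclose(rgb[..., 0], gray))           # True: all three channels are identical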
Example #2
def process_data(data):
    neutralIndices = []
    emptyIndices = []
    labels = data.iloc[:, 0].tolist()
    for i in range(len(labels)):
        if labels[i] == -2:
            data.at[i, 2] = -1
        elif labels[i] == 2:
            data.at[i, 2] = 1
        #elif labels[i] == 0:
        #    neutralIndices.append(i)
    #processed = data.drop(neutralIndices)
    processed = data
    # upsample the -1 and 0 classes
    multiplier = upsample_multiplier(processed, -1, 1)
    isNeg = processed[2] == -1
    df_try = processed[isNeg]
    data2 = processed.append([df_try] * multiplier, ignore_index=True)

    multiplier = upsample_multiplier(processed, 0, 1)
    isNeut = processed[2] == 0
    df_try = processed[isNeut]
    data2 = data2.append([df_try] * multiplier, ignore_index=True)

    labels2 = data2.iloc[:, 0].tolist()
    #for i in range(len(labels2)):
    #    if labels2[i] == -1:
    #        data2.at[i,2] = 0

    p = Preprocessor()
    texts = data2.iloc[:, 1].tolist()
    processed_texts = p.preprocess(texts)
    labels = data2.iloc[:, 0].tolist()
    write_csv(processed_texts, labels)
Example #3
def main():
    pp = Preprocessor()
    print 'processing custom data, computing bows...'
    tdpath = 'dataset/test/sms-data'
    pp.process_custom_data(tdpath)
    
    fm = FeatureModel()
    print 'converting custom data to fvs...'
    fm.compute_custom_fv_matrix('custom')
    
    tdpath = 'bin_data/custom_fv.npy'
    cpath = 'bin_data/mnb-classifier.npy'
    data = np.load('bin_data/custom-data.npy').item()
    
    tester = Tester(tdpath,cpath)
    print 'predicting labels for custom data...'
    results = tester.predict_labels_for_custom_data(data)
    
    with open('output/results.txt','w') as textfile:
        for msg in results:
            line = '%s -> %s\n' % (msg,results[msg])
            textfile.write(line)
        
    
    print 'Results written to results.txt'
Example #4
 def __init__(self):
     self.preProcessor = Preprocessor()
     self.lstmSize = 256
     self.lstmLayers = 1
     self.batchSize = 500
     self.learningRate = 0.0005
     self.seqL = 100
Example #5
    def __init__(self,
                 repo_path,
                 model_type,
                 use_translated_data,
                 term_similarity_type,
                 lang_code,
                 link_threshold_interval=5,
                 output_sub_dir="",
                 print_result=False,
                 github_projects_dir=default_git_dir):
        """

        :param repo_path: the repo path in github
        :param model_type: vsm, gvsm, lda
        :param use_translated_data: whether to use the translated data or not
        :param term_similarity_type: for gvsm only.
        :param link_threshold_interval: The sample rate for threshold
        :param output_sub_dir: the subdirectory for results under Experiment2/result/. Groups the experiments by the time the script was run
        """
        self.git_projects_dir = github_projects_dir
        self.use_translated_data = use_translated_data
        self.model_type = model_type
        self.repo_path = repo_path
        self.lang_code = lang_code
        self.data_dir = os.path.join(self.git_projects_dir, repo_path)
        self.preprocessor = Preprocessor()
        self.preprocessed_dataset()  # Create clean tokens if they do not already exist
        self.link_threshold_interval = link_threshold_interval
        self.term_similarity_type = term_similarity_type
        self.output_sub_dir = output_sub_dir
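A hypothetical instantiation based only on the constructor signature and docstring above; the class name (here ExperimentRunner) and every argument value are illustrative placeholders, not taken from the project:

experiment = ExperimentRunner(          # hypothetical name for the class defined above
    repo_path="owner/repo",             # the repo path in github
    model_type="vsm",                   # vsm, gvsm, or lda
    use_translated_data=True,
    term_similarity_type="wordnet",     # only consulted for gvsm, per the docstring
    lang_code="zh",
    link_threshold_interval=5,
    output_sub_dir="run_2020_01_01",    # groups results under Experiment2/result/
    print_result=True,
)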
Example #6
    def runModel(self, testSize, debug):
        self.trainVectorizer()
        d = self.getXy('train.tsv')
        if debug:
            X_train, X_test, y_train, y_test = train_test_split(
                d['X'], d['y'], test_size=testSize, random_state=5)
        else:
            X_train = d['X']
            y_train = d['y']
            d_test = self.getXy('test.tsv')
            X_test = d_test['X']
            urlid = d_test['urlid']

        self.fit(X_train, y_train)
        print "20 Fold CV Score: ", np.mean(
            cross_val_score(self.model,
                            d['X'],
                            d['y'],
                            cv=10,
                            scoring='roc_auc'))
        y_predicted = self.predict(X_test)

        if debug:
            print 'Topic Model AUC Score: %f' % roc_auc_score(
                y_test, y_predicted)
        else:
            Pre = Preprocessor()
            Pre.generateSubmission('submission_12.csv', urlid, y_predicted)

        P.figure()
        P.hist(y_predicted, bins=100)
        P.show()
Example #7
    def runModel(self, testSize, debug):
        self.trainVectorizer()
        d = self.getXy('train.tsv')
        if debug:
            X_train, X_test, y_train, y_test = train_test_split(d['X'], d['y'], test_size=testSize, random_state=5)
        else:
            X_train = d['X']
            y_train = d['y']
            d_test = self.getXy('test.tsv')
            X_test = d_test['X']
            urlid = d_test['urlid']

        self.fit(X_train, y_train)
        print "20 Fold CV Score: ", np.mean(cross_val_score(self.model, d['X'], d['y'], cv=10, scoring='roc_auc'))
        y_predicted = self.predict(X_test)

        if debug:
            print 'Topic Model AUC Score: %f' % roc_auc_score(y_test, y_predicted)
        else:
            Pre = Preprocessor()
            Pre.generateSubmission('submission_12.csv', urlid, y_predicted)

        P.figure()
        P.hist(y_predicted, bins=100)
        P.show()
Example #8
    def integrate_images(self):

        self.judge_user_input_or_not()

        image_type_list = list()
        # image_type_list.append(self.var_char1.get())
        # image_type_list.append(self.var_char2.get())
        # image_type_list.append(self.var_char3.get())
        # image_type_list.append(self.var_char4.get())

        csv_handler = CSVParser(self.csv_input_path.get())
        data_dict = csv_handler.get_dict_from_csv()
        integrate_handler = Preprocessor(self.image_input_path.get(),
                                         self.text_output_path.get())

        # group_number = integrate_handler.get_group_number_all()

        self.img_group_number.set(u"Total number: " + str(len(data_dict)) +
                                  u" groups")

        # keyword_dict = integrate_handler.get_keyword_dict()
        for k, v in data_dict.iteritems():
            return_message = integrate_handler.integrate_images(v)
            if len(return_message) > 0:
                self.t_show.insert(
                    END, u"Ref no. " + return_message +
                    u" has not been finished!\n")
Example #9
class TestPreprocessor(unittest.TestCase):
	def setUp(self):
		cur_dir = os.path.dirname(os.path.realpath(__file__))
		self.tpl_dir = os.path.join(cur_dir,'..','UnitTests','test_templates')
		self.preprocessor = Preprocessor(self.tpl_dir)

	def process_content(self,content,dict,expected):
		actual = self.preprocessor.process_content_with_dict(content,dict)
		self.assertEqual(actual,expected)

	def test_ifs(self):
		dict = {'basic_key':'_trivial_', 'yes_key':True, 'no_key':False}
		self.process_content("basic%basic_key%basic",dict,"basic_trivial_basic")
		self.process_content("%if yes_key%%basic_key%%endif%",dict,"_trivial_")
		self.process_content("%if no_key%%basic_key%%endif%",dict,"")
		self.process_content("aa%if yes_key%%if no_key%%basic_key%%endif%%endif%bb",dict,"aabb")
		self.process_content("start %if non_existant_key%non_existant%endif% finish",dict,"start  finish")
		self.process_content("start %if non_existant_key%%if yes_key%%basic_key%%endif%%endif% finish",dict,"start  finish")
		self.process_content("start %if yes_key%yes%else%no%endif%",dict,"start yes")
		self.process_content("start %if no_key%yes%else%no%endif%",dict,"start no")
		self.process_content("start %if non_existant_key%yes%else%non_exist%endif%",dict,"start non_exist")

		self.process_content("%if yes_key%%if no_key%yes%else%no%endif%%endif%",dict,"no")
		self.process_content("%if no_key% hello1 %else% %if yes_key% hello2 %else% hello3 %endif% %endif%",dict,"  hello2  ")


	def test_includes(self):
		content = self.preprocessor.process_tpl_name_with_dict('test_include',{'a':True,'aa':'a'})
		self.assertEqual(content,'<html><body> a </body></html>')
Example #10
class TestPreprocessor(unittest.TestCase):
    def setUp(self):
        self.pp = Preprocessor()

    def testPreporcessNull(self):
        #Null String should return none
        result = self.pp.clean('')
        self.assertEquals(result, None)

    def testPreporcessOneHiragana(self):
        #One hiragana should return None
        result = self.pp.clean(u'あw(^^)w')
        self.assertEquals(result, None)

    def testPreporcessZenkaku(self):
        #Zenkaku stuff should be converted and cleaned
        result = self.pp.clean(u'全角です123ww')
        self.assertEquals(result, u'全角です123')

    def testPreprocessNakano(self):
        #Test to see if the Preprocessing work as intended
        test_string = u'私の名前は中野ですwwww>あふぉ(^^)o'
        result = self.pp.clean(test_string)
        expected = u'私の名前は中野です'
        self.assertEquals(result, expected)
Example #11
def get_preprocessor(config, features):
	print("Fitting preprocessor...")
	preprocessor = Preprocessor(normalize=config["normalize"],
								reduce_features=config["reduce_features"],
								reducer_type=config["reducer_type"],
								explained_variance=config["explained_variance"])
	preprocessor.train(features)
	return preprocessor
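A hedged usage sketch for get_preprocessor; the config keys mirror the ones read above, while the values and the feature matrix are illustrative:

import numpy as np

config = {
    "normalize": True,
    "reduce_features": True,
    "reducer_type": "pca",        # illustrative value
    "explained_variance": 0.95,   # illustrative value
}
features = np.random.rand(100, 32)   # stand-in feature matrix
preprocessor = get_preprocessor(config, features)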
Example #12
 def __init__(self,
              outputFormat='flat',
              useJarfileManifest=True,
              useChromeManifest=False):
     self.outputFormat = outputFormat
     self.useJarfileManifest = useJarfileManifest
     self.useChromeManifest = useChromeManifest
     self.pp = Preprocessor()
Example #13
 def __init__(self):
     self.root = Tk()
     self.root.title('New')
     self.file = None
     self.textChanged = False
     self.idh = IDHolder()
     self.pre = Preprocessor()
     self.lxa = LexAnalyzer(self.idh)
     self.mdc = MidCoder(self.idh)
Example #14
 def evaluate_text(self, text):
     pre = Preprocessor()
     entry = TextEntry()
     entry.body = text
     pre.entries = [entry]
     predict = self.clf.predict(pre.get_clean_data())
     if self.clf2 is not None:
         predict2 = self.clf2.predict(pre.get_clean_data())
         return (predict + predict2) / 2
     return predict
Example #15
    def create_config_file(self, path, extra=None):
        '''Creates the given config file. A config file is generated by
        taking the corresponding source file and replacing occurrences of
        "@VAR@" by the value corresponding to "VAR" in the substs dict.

        Additional substs are defined according to the file being treated:
            "srcdir" for its the path to its source directory
            "relativesrcdir" for its source directory relative to the top
            "DEPTH" for the path to the top object directory
        '''
        input = self.get_input(path)
        pp = Preprocessor()
        pp.context.update(self.substs)
        pp.context.update(top_srcdir=self.get_top_srcdir(path))
        pp.context.update(srcdir=self.get_file_srcdir(path))
        pp.context.update(relativesrcdir=self.get_relative_srcdir(path))
        pp.context.update(DEPTH=self.get_depth(path))
        if extra:
            pp.context.update(extra)
        pp.do_filter('attemptSubstitution')
        pp.setMarker(None)

        pp.out = FileAvoidWrite(path)
        pp.do_include(input)
        return pp.out.close()
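The docstring describes the substitution rule: each "@VAR@" token is replaced by the value of "VAR" in the substs dict. A minimal, self-contained sketch of just that rule (it is not the mozbuild Preprocessor, only an illustration of the replacement behaviour):

import re

def substitute_at_vars(text, substs):
    # Replace each @VAR@ with substs["VAR"]; unknown tokens are left untouched.
    return re.sub(r"@(\w+)@",
                  lambda m: str(substs.get(m.group(1), m.group(0))),
                  text)

print(substitute_at_vars("prefix = @prefix@", {"prefix": "/usr/local"}))
# prefix = /usr/local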
Example #16
    def preprocess(self, path_sample, dbnum, week_num, week_stride):
        logging.info('........................... Preprocessing Data ..................................')
        self.week_num = week_num
        self.week_stride = week_stride
        self.eval_week = range(self.week_num+1, self.week_num+1+self.week_stride)
        
        logging.info('Fitting evaluator with week splits at %d', week_num)
        
        self.preprocessor = Preprocessor(path_sample, dbnum,
                                         self.weight_post_lookup, self.weight_user_lookup, 
                                         self.start_date, verbose=self.verbose)
        
        # preprocessor.preview_interaction_distribution()
        
        # Return every interaction incurred before cutoff week as the training interaction
        # every interaction incurred after cutoff week as the testing interaction
        # post interactions are user2post, user interactions are user2user
        self.train_post_inter, self.test_post_inter, \
        self.train_user_inter, self.test_user_inter = self.preprocessor.partition_data(self.week_num)

        logging.info('At week %d, recommend for up to week %d', self.week_num, self.eval_week[-1])
        # users = [pid for pid in self.train_post_inter.keys()]  # Note that this gives all users, though some have 0 interactions
        self.users = set(self.train_post_inter.keys()).union(set(self.test_post_inter.keys()))
        logging.info('- Currently, %d users played in this forum', len(self.users))
        logging.info('- in total, %d users are enrolled in this forum', len(self.users))
        
        logging.info('Calculating the overall interactions...')
        self.all_post_inters = {pid: pd.concat([self.train_post_inter[pid], self.test_post_inter[pid]]) 
                                for pid in self.users}
        #print([len(x) for x in self.all_post_inters.values()])
        all_post_inters_df = pd.DataFrame()
        for pid, df in self.all_post_inters.items():
            df['PersonID'] = pid
            all_post_inters_df = all_post_inters_df.append(df)
        self.all_post_inters_df = all_post_inters_df
        #logging.debug(self.all_post_inters_df.Weeknum.unique())
        
        # Essentially a dictionary of dictionaries, keyed by [pid],
        # because each [pid] corresponds to a different test set
        for pid in self.users:
            self.eval_nids_per_person[pid] = self.extract_evaluation_inters_for_pid(pid) 
        
        all_notes = set()
        num_active_users = 0
        for pid, inter in self.train_post_inter.items():
            all_notes = all_notes.union(set(inter['NoteID'].unique()))
            if len(inter):
                num_active_users += 1
        logging.info('- the forum currently has %d posts', len(all_notes))
        logging.info('- %d users have made interactions', num_active_users)
        
        self.unrec_noteids = self.preprocessor.get_unsharable_posts()
        self.hierarchy = self.preprocessor.hierarchy
        self.all_note_contents = self.preprocessor.all_note_contents
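The comments above describe the partition: interactions before the cutoff week become training data and interactions after it become test data. A self-contained pandas illustration of that split (column values are made up; where the cutoff week itself lands follows partition_data, which is not shown):

import pandas as pd

inters = pd.DataFrame({"NoteID":  [1, 2, 3, 4, 5],
                       "Weeknum": [1, 2, 3, 4, 5]})
cutoff_week = 3
train = inters[inters["Weeknum"] <= cutoff_week]   # boundary handling is illustrative
test = inters[inters["Weeknum"] > cutoff_week]
print(len(train), len(test))                       # 3 2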
Example #17
def trainer(op, file):

    p = Preprocessor()
    X_train, labels_train, X_test, labels_test = p.load_data()
    Y_train = one_hot_matrix(labels=labels_train, C=10)
    Y_test = one_hot_matrix(labels=labels_test, C=10)

    X_train = X_train.T / 255
    X_test = X_test.T / 255

    model(X_train, Y_train, X_test, Y_test, op, file)
Example #18
File: main.py Project: mlynarzsrem/IWM
def train():
    fileLoader = FileLoader("data/orginal", "data/result")
    files = fileLoader.getFilePairs()
    samples = []
    print("Sample extracting")
    for file in files:
        sampleExtracter = SampleExtracter(file[0], file[1], 10)
        samples += sampleExtracter.getSamples()
    print("Preprocessing")
    p = Preprocessor(samples)
    samples = p.getTrainingData()
    c = Classifier(samples[:100000], 10)
Example #19
 def __init__(self,
              isReversed=False,
              isStop=False,
              isStem=False,
              default=None):
     dict.__init__(self)
     self.default = default
     self._isReversed = isReversed
     self._isStop = isStop
     self._isStem = isStem
     self.__pre = Preprocessor(isReversed=self._isReversed,
                               isStop=self._isStop,
                               isStem=self._isStem)
Example #20
 def __init__(self, outputFormat = 'flat', useJarfileManifest = True,
              useChromeManifest = False):
   self.outputFormat = outputFormat
   self.useJarfileManifest = useJarfileManifest
   self.useChromeManifest = useChromeManifest
   self.pp = Preprocessor()
   self.topsourcedir = None
   self.sourcedirs = []
   self.localedirs = None
   self.l10nbase = None
   self.l10nmerge = None
   self.relativesrcdir = None
   self.rootManifestAppId = None
Example #21
File: Server.py Project: jeffasd/Finch
    def pictures_html_block(self):
        pictures_preprocessor = Preprocessor(tpl_dir)
        dict = {
            "img_size_l": "1836x2448",
            "img_size_m": "1224x1632",
            "img_size_s": "612x816",
            "img_width_t": 150,
            "img_height_t": 150,
        }

        text = ""
        for i in xrange(1, 16):
            text = text + pictures_preprocessor.process_tpl_name_with_dict("asset", dict)
        return text
Example #22
File: Server.py Project: jeffasd/Finch
 def album_list_html_block(self):
     albums_preprocessor = Preprocessor(tpl_dir)
     dict = {
         "number_of_pictures": 152,
         "album_name": "My Photos",
         "album_share_href": "/album.html",
         "img_width_t": 150,
         "img_height_t": 150,
         "poster_image_src": "/test_image.jpeg",
     }
     text = ""
     for i in xrange(1, 4):
         text = text + albums_preprocessor.process_tpl_name_with_dict("album_list_item", dict)
     return text
Example #23
    def create_config_file(self, path, extra=None):
        """Creates the given config file. A config file is generated by
        taking the corresponding source file and replacing occurrences of
        "@VAR@" by the value corresponding to "VAR" in the substs dict.

        Additional substs are defined according to the file being treated:
            "srcdir" for its the path to its source directory
            "relativesrcdir" for its source directory relative to the top
            "DEPTH" for the path to the top object directory
        """
        input = self.get_input(path)
        pp = Preprocessor()
        pp.context.update(self.substs)
        pp.context.update(top_srcdir=self.get_top_srcdir(path))
        pp.context.update(srcdir=self.get_file_srcdir(path))
        pp.context.update(relativesrcdir=self.get_relative_srcdir(path))
        pp.context.update(DEPTH=self.get_depth(path))
        if extra:
            pp.context.update(extra)
        pp.do_filter("attemptSubstitution")
        pp.setMarker(None)

        pp.out = FileAvoidWrite(path)
        pp.do_include(input)
        return pp.out.close()
Example #24
def load_terminal_design_data(raw_dataset_path, grammar_file):
    graphs = rd.load_graphs(grammar_file)
    rules = [rd.create_rule_from_graph(g) for g in graphs]

    all_labels = set()
    for rule in rules:
        for node in rule.lhs.nodes:
            all_labels.add(node.attrs.label)
    all_labels = sorted(list(all_labels))

    preprocessor = Preprocessor(all_labels=all_labels)

    with open(raw_dataset_path, newline='') as log_file:
        reader = csv.DictReader(log_file)

        all_link_features = []
        all_link_adj = []
        all_results = []
        max_nodes = 0
        for row in reader:
            rule_seq = ast.literal_eval(row['rule_seq'])
            result = float(row['result'])

            all_results.append(result)

            # Build a robot from the rule sequence
            robot_graph = make_initial_graph()
            for r in rule_seq:
                matches = rd.find_matches(rules[r].lhs, robot_graph)
                # Always use the first match
                robot_graph = rd.apply_rule(rules[r], robot_graph, matches[0])

            adj_matrix, link_features, _ = preprocessor.preprocess(robot_graph)

            all_link_features.append(link_features)
            all_link_adj.append(adj_matrix)

            max_nodes = max(max_nodes, adj_matrix.shape[0])

        all_adj_matrix_pad, all_link_features_pad, all_masks = [], [], []
        for adj_matrix, link_features in zip(all_link_adj, all_link_features):
            adj_matrix_pad, link_features_pad, masks = preprocessor.pad_graph(
                adj_matrix, link_features, max_nodes=max_nodes)
            all_adj_matrix_pad.append(adj_matrix_pad)
            all_link_features_pad.append(link_features_pad)
            all_masks.append(masks)

    return all_link_features_pad, all_adj_matrix_pad, all_masks, all_results
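A hedged call sketch for load_terminal_design_data; both paths are placeholders, and the unpacking order follows the function's return statement:

# The CSV is expected to have 'rule_seq' and 'result' columns, per the DictReader loop above.
link_features, adj_matrices, masks, results = load_terminal_design_data(
    "data/design_log.csv",   # placeholder path to the raw dataset
    "data/grammar.dot")      # placeholder path to the grammar file read by rd.load_graphs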
Example #25
class TestPreprocessor(unittest.TestCase):
    def setUp(self):
        cur_dir = os.path.dirname(os.path.realpath(__file__))
        self.tpl_dir = os.path.join(cur_dir, '..', 'UnitTests',
                                    'test_templates')
        self.preprocessor = Preprocessor(self.tpl_dir)

    def process_content(self, content, dict, expected):
        actual = self.preprocessor.process_content_with_dict(content, dict)
        self.assertEqual(actual, expected)

    def test_ifs(self):
        dict = {'basic_key': '_trivial_', 'yes_key': True, 'no_key': False}
        self.process_content("basic%basic_key%basic", dict,
                             "basic_trivial_basic")
        self.process_content("%if yes_key%%basic_key%%endif%", dict,
                             "_trivial_")
        self.process_content("%if no_key%%basic_key%%endif%", dict, "")
        self.process_content(
            "aa%if yes_key%%if no_key%%basic_key%%endif%%endif%bb", dict,
            "aabb")
        self.process_content(
            "start %if non_existant_key%non_existant%endif% finish", dict,
            "start  finish")
        self.process_content(
            "start %if non_existant_key%%if yes_key%%basic_key%%endif%%endif% finish",
            dict, "start  finish")
        self.process_content("start %if yes_key%yes%else%no%endif%", dict,
                             "start yes")
        self.process_content("start %if no_key%yes%else%no%endif%", dict,
                             "start no")
        self.process_content(
            "start %if non_existant_key%yes%else%non_exist%endif%", dict,
            "start non_exist")

        self.process_content(
            "%if yes_key%%if no_key%yes%else%no%endif%%endif%", dict, "no")
        self.process_content(
            "%if no_key% hello1 %else% %if yes_key% hello2 %else% hello3 %endif% %endif%",
            dict, "  hello2  ")

    def test_includes(self):
        content = self.preprocessor.process_tpl_name_with_dict(
            'test_include', {
                'a': True,
                'aa': 'a'
            })
        self.assertEqual(content, '<html><body> a </body></html>')
Example #26
def get_preprocessor_names():
    result = []

    for clazz in Preprocessor.__subclasses__():
        result.append(clazz.get_name())

    return result
Example #27
def get_preprocessor_names():
  result = []

  for clazz in Preprocessor.__subclasses__():
    result.append(clazz.get_name())

  return result
Example #28
 def getEdgeDistByMask(self, mask3D, setID, sigma=4.5):
     result = Preprocessor.loadThresholdMask(setID)
     #result = generic_gradient_magnitude(result, sobel).astype(np.float32)
     #result = nd.filters.gaussian_filter(result, sigma)
     result = morph.distance_transform_cdt(result, metric='taxicab').astype(
         np.float32)
     return result[mask3D]
Example #29
 def __init__(self, fo_lang_code):
     # Set up the Stanford CoreNLP server:
     # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse,depparse
     #      -status_port 9000 -port 9000 -timeout 15000 -serverProperties StanfordCoreNLP-chinese.properties
     self.parser = CoreNLPParser()
     self.fo_lang_code = fo_lang_code
     self.preprocessor = Preprocessor()
Example #30
File: run_stl10.py Project: yzqttt/LCI
def main(_):
    # Define data pre-processors
    load_shape = [80, 80, 3]
    shape_transfer = [64, 64, 3]
    crop_sz = (64, 64)
    preprocessor = Preprocessor(target_shape=load_shape, src_shape=(96, 96, 3))
    preprocessor_lin = Preprocessor(target_shape=shape_transfer, src_shape=(96, 96, 3))

    # Initialize the data generators
    data_gen_ssl = STL10('train_unlabeled')
    data_gen_ftune = STL10('train')
    data_test = STL10('test')

    # Define the network and SSL training
    model = TRCNet(batch_size=FLAGS.batch_size, im_shape=load_shape, n_tr_classes=6, tag=FLAGS.tag,
                   lci_patch_sz=42, lci_crop_sz=48, n_layers_lci=4, ae_dim=48,
                   enc_params={'padding': 'SAME'})
    trainer = CINTrainer(model=model, data_generator=data_gen_ssl, pre_processor=preprocessor, crop_sz=crop_sz,
                         wd_class=FLAGS.wd, init_lr_class=FLAGS.pre_lr,
                         num_epochs=FLAGS.n_eps_pre, num_gpus=FLAGS.num_gpus,
                         optimizer='adam', init_lr=0.0002, momentum=0.5,  # Parameters for LCI training only
                         train_scopes='features')
    trainer.train_model(None)

    # Get the final checkpoint
    ckpt_dir_model = trainer.get_save_dir()
    ckpt = wait_for_new_checkpoint(ckpt_dir_model, last_checkpoint=None)
    print('Found checkpoint: {}'.format(ckpt))
    ckpt_id = ckpt.split('-')[-1]

    # Train linear classifiers on frozen features
    tag_class = '{}_classifier_ckpt_{}'.format(FLAGS.tag, ckpt_id)
    model = TRCNet(batch_size=FLAGS.batch_size_ftune, im_shape=shape_transfer, tag=tag_class,
                   feats_ids=['conv_1', 'conv_2', 'conv_3', 'conv_4', 'conv_5'],
                   enc_params={'use_fc': False, 'padding': 'SAME'})
    trainer_class = ClassifierTrainer(model=model, data_generator=data_gen_ftune, pre_processor=preprocessor_lin,
                                      optimizer='momentum', init_lr=FLAGS.ftune_lr, momentum=0.9,
                                      num_epochs=FLAGS.n_eps_ftune, num_gpus=1,
                                      train_scopes='classifier')
    trainer_class.train_model(ckpt)
    ckpt_dir = trainer_class.get_save_dir()

    # Evaluate on the test set
    model.batch_size = 100
    tester = ClassifierTester(model=model, data_generator=data_test, pre_processor=preprocessor_lin)
    acc = tester.test_classifier(ckpt_dir)
    write_experiments_multi(acc, tag_class, FLAGS.tag)
Example #31
    def __init__(self,
                 sess,
                 params,
                 batch_size=256,
                 sample_size=64,
                 epochs=1000,
                 image_shape=[256, 256, 3],
                 y_dim=None,
                 z_dim=0,
                 gf_dim=128,
                 df_dim=64,
                 gfc_dim=512,
                 dfc_dim=1024,
                 c_dim=3,
                 cg_dim=1,
                 is_train=True,
                 random_seed=4285):
        self.model_name = "DCGAN.model"
        self.sess = sess
        self.batch_size = batch_size
        self.sample_size = sample_size
        self.epochs = epochs

        self.image_shape = image_shape
        self.image_size = image_shape[0]

        self.y_dim = y_dim
        self.z_dim = z_dim
        self.z = None

        self.gf_dim = gf_dim
        """ gf_dim: Dimension of gen (ie decoder of AE) filters in first conv layer. [128] """
        self.df_dim = df_dim
        """ df_dim: Dimension of discrim (ie Dsc + encoder of AE) filters in first conv layer. [64] """

        self.gfc_dim = gfc_dim
        """ as of 28.9: not used """
        self.dfc_dim = dfc_dim
        """ as of 28.9: not used """

        self.c_dim = c_dim
        """ c_dim: Dimension of image color. [3] """
        self.cg_dim = cg_dim
        """ as of 28.9: not used """

        self.params = params

        self.end = False

        self.random_seed = random_seed

        self.isIdeRun = 'lz826' in os.path.realpath(sys.argv[0])

        self.isTraining = True

        target_shape = [self.image_size, self.image_size, 3]
        DCGAN.img_preprocessor = Preprocessor(target_shape=target_shape)

        self.build_model()
Example #32
 def __init__(self, list_of_files, list_of_models, ratios_list, need_to_make_models = True):
     if len(list_of_files) != len(list_of_models):
         raise ValueError("list of files must be same length as list of models as each file needs its own model")
     for index in range(len(list_of_files)):
         preprocessor_to_add = Preprocessor(list_of_files[index], list_of_models[index], need_to_create_model = need_to_make_models)
         self.preprocessor_list.append(preprocessor_to_add)
     self.ratios_list = ratios_list
     self.line_number_list = self.preprocessor_list[0].get_random_line_numbers(self.ratios_list)
Example #33
 def get_authors_and_title(text):
     #print text.encode('utf8')
     pattern = u'\x14(.*)\x15'
     m = re.search(pattern, text.split('\n')[0])
     all = m.group(1)
     #print all.encode('utf8')
     authors, title = Preprocessor.extract_authors(all)
     return authors,title
Example #34
 def __init__(self,
              outputFormat='flat',
              useJarfileManifest=True,
              useChromeManifest=False):
     self.outputFormat = outputFormat
     self.useJarfileManifest = useJarfileManifest
     self.useChromeManifest = useChromeManifest
     self.pp = Preprocessor()
Example #35
 def __init__(self, outputFormat = 'flat', useJarfileManifest = True,
              useChromeManifest = False):
   self.outputFormat = outputFormat
   self.useJarfileManifest = useJarfileManifest
   self.useChromeManifest = useChromeManifest
   self.pp = Preprocessor()
   self.topsourcedir = None
   self.sourcedirs = []
   self.localedirs = None
Example #36
 def __init__(self, preprocessor=None, model=None):
     self.preprocessor = preprocessor
     if self.preprocessor is None:
         self.preprocessor = Preprocessor()
         
     if self.preprocessor.pos_words == []:
         with open('./preprocessor.pkl', 'rb') as file:
             (self.preprocessor.pos_words, 
              self.preprocessor.neg_words, 
              self.preprocessor.ohe_dc, 
              self.preprocessor.ohe_out_columns) = pickle.load(file)
         
     self.parser = Parser(MONEY)
     
     if model is None:
         self.model = Model()
         self.model.load('./models')
     else:
         self.model = model
Example #37
def main(config_filename):
    logger.debug("Starting execution.")
    parameters = Parameters(config_filename, training_mode=True)
    if parameters.preprocessed_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please, provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = read_excel(parameters.excel_file)
            logger.info("Creating documents.")
            docs = data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package, stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu, stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir, training_mode=parameters.training_mode)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data, preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package, vectorizer_name=parameters.vectorizer, training_mode=parameters.training_mode, use_lda=parameters.use_lda, document_adjustment_code=parameters.document_adjustment_code, remove_adjectives=parameters.remove_adjectives, synonyms_file=parameters.synonyms_file, features_file=parameters.features_file)
    X, y, _lemmas = feature_extractor.generate_X_y(class_field=parameters.excel_column_with_classification_data, preprocessed_data_file=parameters.preprocessed_data_file)
    logger.info("Splitting dataset into training and test subsets.")    
    train_test_split(y, parameters.test_subset_size, parameters.preprocessed_data_file, parameters.force_subsets_regeneration)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers, parameters.cross_validate)
    metadata = pickle_manager.get_docs_metadata(parameters.preprocessed_data_file)
    training_set_indexes = metadata['training_set_indexes'].tolist()
    test_set_indexes = metadata['test_set_indexes'].tolist()
    assert len(training_set_indexes) == len(set(training_set_indexes))
    assert len(test_set_indexes) == len(set(test_set_indexes))
    for elem in feature_extractor.to_remove:
        try:
            training_set_indexes.remove(elem)
        except ValueError:
            test_set_indexes.remove(elem)
    logger.info("Accuracies:")
    p.start(X, y, parameters.number_of_jobs, parameters.set_num_accepted_probs, training_set_indexes, test_set_indexes, parameters.resampling)
    logger.debug("Execution completed.")
Example #38
 def _init_test_images(self):
     real_test_image = Preprocessor.preprocess('data/ID_000178e76.dcm')
     test_images = [
         np.zeros((512, 512)),
         np.ones((512, 512)),
         np.random.rand(512, 512), real_test_image
     ]
     for i in range(len(test_images)):
         test_images[i] = np.repeat(test_images[i][..., np.newaxis], 3, -1)
     return np.array(test_images)
Example #39
def main():
    df = load_raw_data(kci_korean_json_filepath)
    
    preprocessor = Preprocessor()
    df = raw2sentences(preprocessor, df)
    
    df['flattened_sentences'] = df.apply(lambda x: ' '.join(x['sentences']),axis=1)
    stopwords = preprocessor.stopwords(df['flattened_sentences'], min_df)
    
    print('Extracting nouns..')
    df['nouns'] = df.apply(lambda x: preprocessor.line2words_nouns(x['flattened_sentences'], stopwords, remove_len=remove_len), axis=1)
    
    whole_sentences = preprocessor.flatten_whole_sentences(df, 'nouns')
    print('# of documents = %d' % len(whole_sentences))
    
    # save as .txt
    f = open(whole_units_for_train_txt_filepath, 'w')
    for i in range(len(whole_sentences)):
        data = "%s\n" % whole_sentences[i]
        f.write(data)
    f.close()
    print('Created file:', whole_units_for_train_txt_filepath)
    
    # process text to tensor
    loader = Quantizer(whole_sentences)
    word_vocab_size = min(n_words, len(loader.idx2word))
    char_vocab_size = min(n_chars, len(loader.idx2char))
    max_word_l = loader.max_word_l
    print('Word vocab size: %d, Char vocab size: %d, Max word length (incl. padding): %d' % (word_vocab_size, char_vocab_size, max_word_l))
    
    log_content = '\n=====\n# of stopwords=%d \n%s\n=====\n# of unique words=%d \n# of unique chars=%d \nmaximum length of a word=%d \n=====\n' % (len(stopwords), str(stopwords), word_vocab_size, char_vocab_size, max_word_l)
    write_log(log_dir, 'preprocessing_vocab.log', log_content)

    print('creating an LSTM-CNN with', num_layers, 'layers')
    model = LSTMCNN(char_vocab_size, char_vec_size, feature_maps, kernels, batch_size, seq_length, max_word_l, batch_norm, highway_layers, num_layers, rnn_size, dropout, word_vocab_size, learning_rate, max_grad_norm)
        
    pickle.dump(parameters, open(model_param_pkl_filepath, "wb"))
    model.save(model_json_filepath)
    
    Train, Validation, Test = 0, 1, 2
    model.fit_generator(loader.next_batch(Train), loader.split_sizes[Train], max_epochs, loader.next_batch(Validation), loader.split_sizes[Validation], decay_when, learning_rate_decay, save_every, save_epoch_file)
    model.save_weights(model_weights_h5_filepath, overwrite=True)
Example #40
def get_new_instance(preprocessor_type):
  result = None

  for clazz in Preprocessor.__subclasses__():
    if clazz.get_name() == preprocessor_type:
      result = clazz()
      break

  if result is None:
    raise ValueError("Cannot find retriever of type %s" % (preprocessor_type, ))

  return result
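Both helpers above discover implementations through Preprocessor.__subclasses__() and a get_name() hook. A self-contained sketch of that registration pattern with hypothetical subclass names (it mirrors the discovery mechanism only, not the project's real classes):

class Preprocessor(object):
    # Minimal stand-in base class for the discovery sketch.
    @classmethod
    def get_name(cls):
        raise NotImplementedError

class LowercasePreprocessor(Preprocessor):   # hypothetical subclass
    @classmethod
    def get_name(cls):
        return "lowercase"

class StemmingPreprocessor(Preprocessor):    # hypothetical subclass
    @classmethod
    def get_name(cls):
        return "stemming"

# The same walk the two helpers perform over the registered subclasses:
names = [clazz.get_name() for clazz in Preprocessor.__subclasses__()]
print(names)                                 # ['lowercase', 'stemming']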
Example #41
class TestLineEndings(unittest.TestCase):
  """
  Unit tests for the Context class
  """

  def setUp(self):
    self.pp = Preprocessor()
    self.pp.out = StringIO()
    self.tempnam = os.tempnam('.')

  def tearDown(self):
    os.remove(self.tempnam)

  def createFile(self, lineendings):
    f = open(self.tempnam, 'wb')
    for line, ending in zip(['a', '#literal b', 'c'], lineendings):
      f.write(line+ending)
    f.close()

  def testMac(self):
    self.createFile(['\x0D']*3)
    self.pp.do_include(self.tempnam)
    self.assertEquals(self.pp.out.getvalue(), 'a\nb\nc\n')

  def testUnix(self):
    self.createFile(['\x0A']*3)
    self.pp.do_include(self.tempnam)
    self.assertEquals(self.pp.out.getvalue(), 'a\nb\nc\n')

  def testWindows(self):
    self.createFile(['\x0D\x0A']*3)
    self.pp.do_include(self.tempnam)
    self.assertEquals(self.pp.out.getvalue(), 'a\nb\nc\n')
Example #42
 def __init__(self, outputFormat = 'flat', useJarfileManifest = True,
              useChromeManifest = False):
   self.outputFormat = outputFormat
   self.useJarfileManifest = useJarfileManifest
   self.useChromeManifest = useChromeManifest
   self.pp = Preprocessor()
   self.topsourcedir = None
   self.sourcedirs = []
   self.localedirs = None
   self.l10nbase = None
   self.l10nmerge = None
   self.relativesrcdir = None
   self.rootManifestAppId = None
Example #43
    def trainClassifier(self, trainLetter, progress, progLab, maxSets):

        # load and preprocess the signal
        signalLoader = SignalLoader(self.chanNum,self.files)
        prpr = Preprocessor(self.chanNum,[])
        signal,stimCode,phaseInSequence = signalLoader.loadSignal()
        self.signal = prpr.preprocess(240,1E-1,30E0,self.sf,signal,stimCode,phaseInSequence,0)
        self.stimulusCode = prpr.stimulusCode
        self.phaseInSequence = prpr.phaseInSequence
        self.targetLetters = sum(trainLetter,[])

        # find the boundaries between characters
        charEnds = self.findCharEnds()

        # split the data into epochs
        em = EpochManager(self.signal,self.stimulusCode,self.phaseInSequence)
        isiList = em.createEpochs()

        # train on the individual characters
        for i in range(len(charEnds)):
            progress["value"] = i
            progLab["text"] = ("Trénujem znak: {}/{}").format(i+1, len(charEnds))
            print "Averaging character:",i,"\n"
            hi = charEnds[i]
            if i == 0:
                lo = 0
            else:
                lo = charEnds[i-1]

            rowColBinList = em.getAveragedEpochs(hi,lo,isiList,maxSets)
            finalDataArray = rowColBinList
            classMarks = self.prepairTargetArray(self.getCharIndexes(self.targetLetters[i]))

            if self.firsttrain == 1:
                self.cl.learn(finalDataArray,classMarks,0)
                self.firsttrain = 0
            else:
                self.cl.learn(finalDataArray,classMarks)
Example #44
def preprocess(input, parser, defines={}):
    '''
    Preprocess the file-like input with the given defines, and send the
    preprocessed output line by line to the given parser.
    '''
    pp = Preprocessor()
    pp.context.update(defines)
    pp.do_filter('substitution')
    pp.out = PreprocessorOutputWrapper(pp, parser)
    pp.do_include(input)
Example #45
def make_preprocessor(config_status):
    pp = Preprocessor()
    pp.setLineEndings("lf")
    pp.setMarker("#")
    pp.do_filter("substitution")

    # Might need 'substs' too.
    defines = {}
    for k, v in config_status['defines']:
        if v:
            defines[k] = v
    pp.context.update(defines)

    return pp
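A hedged call sketch for make_preprocessor: the loop above reads config_status['defines'] as (key, value) pairs and drops falsy values, so a plain dict of that shape is enough for illustration (the define names are invented):

config_status = {"defines": [("MOZ_APP_NAME", "browser"), ("MOZ_DEBUG", "")]}
pp = make_preprocessor(config_status)
# Only MOZ_APP_NAME reaches pp.context; the empty MOZ_DEBUG value is filtered out above.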
Example #46
    def getXy(self, path):
        raw = self.getRaw(path)
        docs = self.getDocs(raw['boilerplate'])

        if 'label' in raw:
            y = raw['label']

        docs = self.preprocessDocs(docs)
        #docs = self.expandVocab(docs)
        print "vectorizing..."
        X_text = self.vectorizer.transform(docs)
        X_text = self.tsvd.transform(X_text)

        print 'X Sparse Array Size:'
        print X_text.shape
        self.Pre = Preprocessor()
        X_meta, y, urlid = self.Pre.preprocess(raw)
        #X_meta = np.abs(X_meta)
        #X = hstack([X_meta,X_text])
        X = X_text
        d = {'X': X, 'y':y, 'urlid': urlid}
        return d
Example #47
    def _get_preprocessor(self, path, extra):
        '''Returns a preprocessor for use by create_config_file and
        create_makefile.
        '''
        pp = Preprocessor()
        pp.context.update(self.substs)
        pp.context.update(top_srcdir = self.get_top_srcdir(path))
        pp.context.update(srcdir = self.get_file_srcdir(path))
        pp.context.update(relativesrcdir = self.get_relative_srcdir(path))
        pp.context.update(DEPTH = self.get_depth(path))
        if extra:
            pp.context.update(extra)
        pp.do_filter('attemptSubstitution')
        pp.setMarker(None)

        pp.out = FileAvoidWrite(path)
        return pp
Example #48
class JarMaker(object):
  '''JarMaker reads jar.mn files and processes them into jar files or
  flat directories, along with chrome.manifest files.
  '''

  ignore = re.compile('\s*(\#.*)?$')
  jarline = re.compile('(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$')
  regline = re.compile('\%\s+(.*)$')
  entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+'
  entryline = re.compile(entryre + '(?P<output>[\w\d.\-\_\\\/\+]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/]+)\))?\s*$')

  def __init__(self, outputFormat = 'flat', useJarfileManifest = True,
               useChromeManifest = False):
    self.outputFormat = outputFormat
    self.useJarfileManifest = useJarfileManifest
    self.useChromeManifest = useChromeManifest
    self.pp = Preprocessor()

  def getCommandLineParser(self):
    '''Get an optparse.OptionParser for jarmaker.

    This OptionParser has the options for jarmaker as well as
    the options for the inner PreProcessor.
    '''
    # HACK, we need to unescape the string variables we get,
    # the perl versions didn't grok strings right
    p = self.pp.getCommandLineParser(unescapeDefines = True)
    p.add_option('-f', type="choice", default="jar",
                 choices=('jar', 'flat', 'symlink'),
                 help="fileformat used for output", metavar="[jar, flat, symlink]")
    p.add_option('-v', action="store_true", dest="verbose",
                 help="verbose output")
    p.add_option('-q', action="store_false", dest="verbose",
                 help="verbose output")
    p.add_option('-e', action="store_true",
                 help="create chrome.manifest instead of jarfile.manifest")
    p.add_option('--both-manifests', action="store_true",
                 dest="bothManifests",
                 help="create chrome.manifest and jarfile.manifest")
    p.add_option('-s', type="string", action="append", default=[],
                 help="source directory")
    p.add_option('-t', type="string",
                 help="top source directory")
    p.add_option('-c', '--l10n-src', type="string", action="append",
                 help="localization directory")
    p.add_option('--l10n-base', type="string", action="append", default=[],
                 help="base directory to be used for localization (multiple)")
    p.add_option('-j', type="string",
                 help="jarfile directory")
    # backwards compat, not needed
    p.add_option('-a', action="store_false", default=True,
                 help="NOT SUPPORTED, turn auto-registration of chrome off (installed-chrome.txt)")
    p.add_option('-d', type="string",
                 help="UNUSED, chrome directory")
    p.add_option('-o', help="cross compile for auto-registration, ignored")
    p.add_option('-l', action="store_true",
                 help="ignored (used to switch off locks)")
    p.add_option('-x', action="store_true",
                 help="force Unix")
    p.add_option('-z', help="backwards compat, ignored")
    p.add_option('-p', help="backwards compat, ignored")
    return p

  def processIncludes(self, includes):
    '''Process given includes with the inner PreProcessor.

    Only use this for #defines, the includes shouldn't generate
    content.
    '''
    self.pp.out = StringIO()
    for inc in includes:
      self.pp.do_include(inc)
    includesvalue = self.pp.out.getvalue()
    if includesvalue:
      logging.info("WARNING: Includes produce non-empty output")
    self.pp.out = None
    pass

  def finalizeJar(self, jarPath, chromebasepath, register,
                  doZip=True):
    '''Helper method to write out the chrome registration entries to
    jarfile.manifest or chrome.manifest, or both.

    The actual file processing is done in updateManifest.
    '''
    # rewrite the manifest, if entries given
    if not register:
      return

    chromeManifest = os.path.join(os.path.dirname(jarPath),
                                  '..', 'chrome.manifest')

    if self.useJarfileManifest:
      self.updateManifest(jarPath + '.manifest', chromebasepath % '',
                          register)
      addEntriesToListFile(chromeManifest, ['manifest chrome/%s.manifest' % (os.path.basename(jarPath),)])
    if self.useChromeManifest:
      self.updateManifest(chromeManifest, chromebasepath % 'chrome/',
                          register)

  def updateManifest(self, manifestPath, chromebasepath, register):
    '''updateManifest replaces the % in the chrome registration entries
    with the given chrome base path, and updates the given manifest file.
    '''
    lock = lockFile(manifestPath + '.lck')
    try:
      myregister = dict.fromkeys(map(lambda s: s.replace('%', chromebasepath),
                                     register.iterkeys()))
      manifestExists = os.path.isfile(manifestPath)
      mode = (manifestExists and 'r+b') or 'wb'
      mf = open(manifestPath, mode)
      if manifestExists:
        # import previous content into hash, ignoring empty ones and comments
        imf = re.compile('(#.*)?$')
        for l in re.split('[\r\n]+', mf.read()):
          if imf.match(l):
            continue
          myregister[l] = None
        mf.seek(0)
      for k in myregister.iterkeys():
        mf.write(k + os.linesep)
      mf.close()
    finally:
      lock = None
  
  def makeJar(self, infile=None,
               jardir='',
               sourcedirs=[], topsourcedir='', localedirs=None):
    '''makeJar is the main entry point to JarMaker.

    It takes the input file, the output directory, the source dirs and the
    top source dir as argument, and optionally the l10n dirs.
    '''
    if isinstance(infile, basestring):
      logging.info("processing " + infile)
    pp = self.pp.clone()
    pp.out = StringIO()
    pp.do_include(infile)
    lines = pushback_iter(pp.out.getvalue().splitlines())
    try:
      while True:
        l = lines.next()
        m = self.jarline.match(l)
        if not m:
          raise RuntimeError(l)
        if m.group('jarfile') is None:
          # comment
          continue
        self.processJarSection(m.group('jarfile'), lines,
                               jardir, sourcedirs, topsourcedir,
                               localedirs)
    except StopIteration:
      # we read the file
      pass
    return

  def makeJars(self, infiles, l10nbases,
               jardir='',
               sourcedirs=[], topsourcedir='', localedirs=None):
    '''makeJars is the second main entry point to JarMaker.

    It takes an iterable sequence of input file names, the l10nbases,
    the output directory, the source dirs and the
    top source dir as argument, and optionally the l10n dirs.

    It iterates over all inputs, guesses srcdir and l10ndir from the
    path and topsourcedir and calls into makeJar.

    The l10ndirs are created by guessing the relativesrcdir, and resolving
    that against the l10nbases. l10nbases can either be path strings, or 
    callables. In the latter case, that will be called with the 
    relativesrcdir as argument, and is expected to return a path string.
    This logic is disabled if the jar.mn path is not inside the topsrcdir.
    '''
    topsourcedir = os.path.normpath(os.path.abspath(topsourcedir))
    def resolveL10nBase(relpath):
      def _resolve(base):
        if isinstance(base, basestring):
          return os.path.join(base, relpath)
        if callable(base):
          return base(relpath)
        return base
      return _resolve
    for infile in infiles:
      srcdir = os.path.normpath(os.path.abspath(os.path.dirname(infile)))
      l10ndir = srcdir
      if os.path.basename(srcdir) == 'locales':
        l10ndir = os.path.dirname(l10ndir)

      l10ndirs = None
      # srcdir may not be a child of topsourcedir, in which case
      # we assume that the caller passed in suitable sourcedirs,
      # and just skip passing in localedirs
      if srcdir.startswith(topsourcedir):
        rell10ndir = l10ndir[len(topsourcedir):].lstrip(os.sep)

        l10ndirs = map(resolveL10nBase(rell10ndir), l10nbases)
        if localedirs is not None:
          l10ndirs += [os.path.normpath(os.path.abspath(s))
                       for s in localedirs]
      srcdirs = [os.path.normpath(os.path.abspath(s))
                 for s in sourcedirs] + [srcdir]
      self.makeJar(infile=infile,
                   sourcedirs=srcdirs, topsourcedir=topsourcedir,
                   localedirs=l10ndirs,
                   jardir=jardir)


  def processJarSection(self, jarfile, lines,
                        jardir, sourcedirs, topsourcedir, localedirs):
    '''Internal method called by makeJar to actually process a section
    of a jar.mn file.

    jarfile is the basename of the jarfile or the directory name for 
    flat output, lines is a pushback_iterator of the lines of jar.mn,
    the remaining options are carried over from makeJar.
    '''

    # chromebasepath is used for chrome registration manifests
    # %s is getting replaced with chrome/ for chrome.manifest, and with
    # an empty string for jarfile.manifest
    chromebasepath = '%s' + os.path.basename(jarfile)
    if self.outputFormat == 'jar':
      chromebasepath = 'jar:' + chromebasepath + '.jar!'
    chromebasepath += '/'

    jarfile = os.path.join(jardir, jarfile)
    jf = None
    if self.outputFormat == 'jar':
      #jar
      jarfilepath = jarfile + '.jar'
      try:
        os.makedirs(os.path.dirname(jarfilepath))
      except OSError, error:
        if error.errno != errno.EEXIST:
          raise
      jf = ZipFile(jarfilepath, 'a', lock = True)
      outHelper = self.OutputHelper_jar(jf)
    else:
Example #49
	def setUp(self):
		cur_dir = os.path.dirname(os.path.realpath(__file__))
		self.tpl_dir = os.path.join(cur_dir,'..','UnitTests','test_templates')
		self.preprocessor = Preprocessor(self.tpl_dir)
Example #50
    def guessChars(self,subset,files,targetLetter,testProgress,progTestLabel,guessView,guessLab,maxSets):
        aktCharNum = 0
        totalChars = len(sum(targetLetter,[]))

        if self.chanNum != 64:
            files.sort()
            files = self.createTriplets(files)


        for m in range(len(files)):
            # load and preprocess the signal
            signalLoader = SignalLoader(self.chanNum,files[m])
            prpr = Preprocessor(self.chanNum,subset)
            signal, stimCode, phaseInSequence = signalLoader.loadSignal()
            self.signal = prpr.preprocess(240,1E-1,30E0,self.sf,signal,stimCode,phaseInSequence,1)
            self.stimulusCode = prpr.stimulusCode
            self.phaseInSequence = prpr.phaseInSequence
            if (len(targetLetter) > m):
                self.targetLetters = targetLetter[m]
            else:
                self.targetLetters = []
            print "Processing file:",m,"\n"

            # find the boundaries between characters
            charEnds = self.findCharEnds()

            # split the data into epochs
            em = EpochManager(self.signal,self.stimulusCode,self.phaseInSequence)
            isiList = em.createEpochs()

            hit = 0
            # guess the individual characters
            for i in range(len(charEnds)):
                testProgress["value"] = aktCharNum
                progTestLabel["text"] = ("Hádam znak: {}/{}").format(aktCharNum+1, totalChars)
                aktCharNum +=1

                hi = charEnds[i]
                if i == 0:
                    lo = 0
                else:
                    lo = charEnds[i-1]

                rowColBinList = em.getAveragedEpochs(hi,lo,isiList,maxSets)
                finalDataArray = self.prepairSignalArray(self.sf.grandAveragingFilter(rowColBinList,subset,1))

                # using the classifier
                char = self.cl.predictTarget(finalDataArray,self.cl.reduce(self.sf,self,subset))

                if len(self.targetLetters) > i:
                    if char == self.targetLetters[i]:
                        hit+=1
                        print "Succesfully guessed char:",char,"\n"
                    else:
                        print "Guessed char:",char,"\n"


                if i == 0:
                    text = "(" + char + ","
                elif i == len(charEnds) - 1:
                    text = char + ")"
                else:
                    text = char + ","

                guessView.configure(state='normal')
                guessView.insert(INSERT, text)
                guessView.configure(state='disabled')

            self.rate += (hit)*100/float(totalChars)
            print "\n Success rate= ",self.rate, "\n"
            guessLab["text"]=("Presnosť: {}").format(self.rate)

        return self.rate
Example #51
class JarMaker(object):
  '''JarMaker reads jar.mn files and processes them into jar files or
  flat directories, along with chrome.manifest files.
  '''

  ignore = re.compile('\s*(\#.*)?$')
  jarline = re.compile('(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$')
  relsrcline = re.compile('relativesrcdir\s+(?P<relativesrcdir>.+?):')
  regline = re.compile('\%\s+(.*)$')
  entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+'
  entryline = re.compile(entryre + '(?P<output>[\w\d.\-\_\\\/\+\@]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/\@]+)\))?\s*$')

  def __init__(self, outputFormat = 'flat', useJarfileManifest = True,
               useChromeManifest = False):
    self.outputFormat = outputFormat
    self.useJarfileManifest = useJarfileManifest
    self.useChromeManifest = useChromeManifest
    self.pp = Preprocessor()
    self.topsourcedir = None
    self.sourcedirs = []
    self.localedirs = None
    self.l10nbase = None
    self.l10nmerge = None
    self.relativesrcdir = None
    self.rootManifestAppId = None

  def getCommandLineParser(self):
    '''Get an optparse.OptionParser for jarmaker.

    This OptionParser has the options for jarmaker as well as
    the options for the inner PreProcessor.
    '''
    # HACK, we need to unescape the string variables we get,
    # the perl versions didn't grok strings right
    p = self.pp.getCommandLineParser(unescapeDefines = True)
    p.add_option('-f', type="choice", default="jar",
                 choices=('jar', 'flat', 'symlink'),
                 help="fileformat used for output", metavar="[jar, flat, symlink]")
    p.add_option('-v', action="store_true", dest="verbose",
                 help="verbose output")
    p.add_option('-q', action="store_false", dest="verbose",
                 help="verbose output")
    p.add_option('-e', action="store_true",
                 help="create chrome.manifest instead of jarfile.manifest")
    p.add_option('--both-manifests', action="store_true",
                 dest="bothManifests",
                 help="create chrome.manifest and jarfile.manifest")
    p.add_option('-s', type="string", action="append", default=[],
                 help="source directory")
    p.add_option('-t', type="string",
                 help="top source directory")
    p.add_option('-c', '--l10n-src', type="string", action="append",
                 help="localization directory")
    p.add_option('--l10n-base', type="string", action="store",
                 help="base directory to be used for localization (requires relativesrcdir)")
    p.add_option('--locale-mergedir', type="string", action="store",
                 help="base directory to be used for l10n-merge (requires l10n-base and relativesrcdir)")
    p.add_option('--relativesrcdir', type="string",
                 help="relativesrcdir to be used for localization")
    p.add_option('-j', type="string",
                 help="jarfile directory")
    p.add_option('--root-manifest-entry-appid', type="string",
                 help="add an app id specific root chrome manifest entry.")
    return p

  def processIncludes(self, includes):
    '''Process given includes with the inner PreProcessor.

    Only use this for #defines, the includes shouldn't generate
    content.
    '''
    self.pp.out = StringIO()
    for inc in includes:
      self.pp.do_include(inc)
    includesvalue = self.pp.out.getvalue()
    if includesvalue:
      logging.info("WARNING: Includes produce non-empty output")
    self.pp.out = None
    pass

  def finalizeJar(self, jarPath, chromebasepath, register,
                  doZip=True):
    '''Helper method to write out the chrome registration entries to
    jarfile.manifest or chrome.manifest, or both.

    The actual file processing is done in updateManifest.
    '''
    # rewrite the manifest, if entries given
    if not register:
      return

    chromeManifest = os.path.join(os.path.dirname(jarPath),
                                  '..', 'chrome.manifest')

    if self.useJarfileManifest:
      self.updateManifest(jarPath + '.manifest', chromebasepath.format(''),
                          register)
      addEntriesToListFile(chromeManifest, ['manifest chrome/{0}.manifest'
                                            .format(os.path.basename(jarPath))])
    if self.useChromeManifest:
      self.updateManifest(chromeManifest, chromebasepath.format('chrome/'),
                          register)

    # If requested, add a root chrome manifest entry (assumed to be in the parent directory
    # of chromeManifest) with the application specific id. In cases where we're building
    # lang packs, the root manifest must know about application sub directories.
    if self.rootManifestAppId:
      rootChromeManifest = os.path.join(os.path.normpath(os.path.dirname(chromeManifest)),
                                        '..', 'chrome.manifest')
      rootChromeManifest = os.path.normpath(rootChromeManifest)
      chromeDir = os.path.basename(os.path.dirname(os.path.normpath(chromeManifest)))
      logging.info("adding '%s' entry to root chrome manifest appid=%s" % (chromeDir, self.rootManifestAppId))
      addEntriesToListFile(rootChromeManifest, ['manifest %s/chrome.manifest application=%s' % (chromeDir, self.rootManifestAppId)])

  def updateManifest(self, manifestPath, chromebasepath, register):
    '''updateManifest replaces the % in the chrome registration entries
    with the given chrome base path, and updates the given manifest file.
    '''
    lock = lockFile(manifestPath + '.lck')
    try:
      myregister = dict.fromkeys(map(lambda s: s.replace('%', chromebasepath),
                                     register.iterkeys()))
      manifestExists = os.path.isfile(manifestPath)
      mode = (manifestExists and 'r+b') or 'wb'
      mf = open(manifestPath, mode)
      if manifestExists:
        # import previous content into hash, ignoring empty ones and comments
        imf = re.compile('(#.*)?$')
        for l in re.split('[\r\n]+', mf.read()):
          if imf.match(l):
            continue
          myregister[l] = None
        mf.seek(0)
      for k in myregister.iterkeys():
        mf.write(k + os.linesep)
      mf.close()
    finally:
      lock = None

  def makeJar(self, infile, jardir):
    '''makeJar is the main entry point to JarMaker.

    It takes the input file, the output directory, the source dirs and the
    top source dir as argument, and optionally the l10n dirs.
    '''
    # making paths absolute, guess srcdir if file and add to sourcedirs
    _normpath = lambda p: os.path.normpath(os.path.abspath(p))
    self.topsourcedir = _normpath(self.topsourcedir)
    self.sourcedirs = [_normpath(p) for p in self.sourcedirs]
    if self.localedirs:
      self.localedirs = [_normpath(p) for p in self.localedirs]
    elif self.relativesrcdir:
      self.localedirs = self.generateLocaleDirs(self.relativesrcdir)
    if isinstance(infile, basestring):
      logging.info("processing " + infile)
      self.sourcedirs.append(_normpath(os.path.dirname(infile)))
    pp = self.pp.clone()
    pp.out = StringIO()
    pp.do_include(infile)
    lines = pushback_iter(pp.out.getvalue().splitlines())
    try:
      while True:
        l = lines.next()
        m = self.jarline.match(l)
        if not m:
          raise RuntimeError(l)
        if m.group('jarfile') is None:
          # comment
          continue
        self.processJarSection(m.group('jarfile'), lines, jardir)
    except StopIteration:
      # we read the file
      pass
    return

  def generateLocaleDirs(self, relativesrcdir):
    if os.path.basename(relativesrcdir) == 'locales':
      # strip locales
      l10nrelsrcdir = os.path.dirname(relativesrcdir)
    else:
      l10nrelsrcdir = relativesrcdir
    locdirs = []
    # generate locales dirs, merge, l10nbase, en-US
    if self.l10nmerge:
      locdirs.append(os.path.join(self.l10nmerge, l10nrelsrcdir))
    if self.l10nbase:
      locdirs.append(os.path.join(self.l10nbase, l10nrelsrcdir))
    if self.l10nmerge or not self.l10nbase:
      # add en-US if we merge, or if it's not l10n
      locdirs.append(os.path.join(self.topsourcedir, relativesrcdir, 'en-US'))
    return locdirs

  def processJarSection(self, jarfile, lines, jardir):
    '''Internal method called by makeJar to actually process a section
    of a jar.mn file.

    jarfile is the basename of the jarfile or the directory name for 
    flat output, lines is a pushback_iterator of the lines of jar.mn,
    the remaining options are carried over from makeJar.
    '''

    # chromebasepath is used for chrome registration manifests
    # {0} is getting replaced with chrome/ for chrome.manifest, and with
    # an empty string for jarfile.manifest
    chromebasepath = '{0}' + os.path.basename(jarfile)
    if self.outputFormat == 'jar':
      chromebasepath = 'jar:' + chromebasepath + '.jar!'
    chromebasepath += '/'

    jarfile = os.path.join(jardir, jarfile)
    jf = None
    if self.outputFormat == 'jar':
      #jar
      jarfilepath = jarfile + '.jar'
      try:
        os.makedirs(os.path.dirname(jarfilepath))
      except OSError as error:
        if error.errno != errno.EEXIST:
          raise
      jf = ZipFile(jarfilepath, 'a', lock = True)
      outHelper = self.OutputHelper_jar(jf)
    else:
      outHelper = getattr(self, 'OutputHelper_' + self.outputFormat)(jarfile)
    register = {}
    # This loop exits on either
    # - the end of the jar.mn file
    # - an line in the jar.mn file that's not part of a jar section
    # - on an exception raised, close the jf in that case in a finally
    try:
      while True:
        try:
          l = lines.next()
        except StopIteration:
          # we're done with this jar.mn, and this jar section
          self.finalizeJar(jarfile, chromebasepath, register)
          if jf is not None:
            jf.close()
          # reraise the StopIteration for makeJar
          raise
        if self.ignore.match(l):
          continue
        m = self.relsrcline.match(l)
        if m:
          relativesrcdir = m.group('relativesrcdir')
          self.localedirs = self.generateLocaleDirs(relativesrcdir)
          continue
        m = self.regline.match(l)
        if  m:
          rline = m.group(1)
          register[rline] = 1
          continue
        m = self.entryline.match(l)
        if not m:
          # neither an entry line nor chrome reg, this jar section is done
          self.finalizeJar(jarfile, chromebasepath, register)
          if jf is not None:
            jf.close()
          lines.pushback(l)
          return
        self._processEntryLine(m, outHelper, jf)
    finally:
      if jf is not None:
        jf.close()
    return

  def _processEntryLine(self, m, outHelper, jf):
      out = m.group('output')
      src = m.group('source') or os.path.basename(out)
      # pick the right sourcedir -- l10n, topsrc or src
      if m.group('locale'):
        src_base = self.localedirs
      elif src.startswith('/'):
        # path/in/jar/file_name.xul     (/path/in/sourcetree/file_name.xul)
        # refers to a path relative to topsourcedir, use that as base
        # and strip the leading '/'
        src_base = [self.topsourcedir]
        src = src[1:]
      else:
        # use srcdirs and the objdir (current working dir) for relative paths
        src_base = self.sourcedirs + [os.getcwd()]
      # check if the source file exists
      realsrc = None
      for _srcdir in src_base:
        if os.path.isfile(os.path.join(_srcdir, src)):
          realsrc = os.path.join(_srcdir, src)
          break
      if realsrc is None:
        if jf is not None:
          jf.close()
        raise RuntimeError('File "{0}" not found in {1}'
                           .format(src, ', '.join(src_base)))
      if m.group('optPreprocess'):
        outf = outHelper.getOutput(out)
        inf = open(realsrc)
        pp = self.pp.clone()
        if src[-4:] == '.css':
          pp.setMarker('%')
        pp.out = outf
        pp.do_include(inf)
        pp.warnUnused(realsrc)
        outf.close()
        inf.close()
        return
      # copy or symlink if newer or overwrite
      if (m.group('optOverwrite')
          or (getModTime(realsrc) >
              outHelper.getDestModTime(m.group('output')))):
        if self.outputFormat == 'symlink':
          outHelper.symlink(realsrc, out)
          return
        outf = outHelper.getOutput(out)
        # open in binary mode, this can be images etc
        inf = open(realsrc, 'rb')
        outf.write(inf.read())
        outf.close()
        inf.close()


  class OutputHelper_jar(object):
    '''Provide getDestModTime and getOutput for a given jarfile.
    '''
    def __init__(self, jarfile):
      self.jarfile = jarfile
    def getDestModTime(self, aPath):
      try :
        info = self.jarfile.getinfo(aPath)
        return info.date_time
      except:
        return 0
    def getOutput(self, name):
      return ZipEntry(name, self.jarfile)

  class OutputHelper_flat(object):
    '''Provide getDestModTime and getOutput for a given flat
    output directory. The helper method ensureDirFor is used by
    the symlink subclass.
    '''
    def __init__(self, basepath):
      self.basepath = basepath
    def getDestModTime(self, aPath):
      return getModTime(os.path.join(self.basepath, aPath))
    def getOutput(self, name):
      out = self.ensureDirFor(name)
      # remove previous link or file
      try:
        os.remove(out)
      except OSError as e:
        if e.errno != errno.ENOENT:
          raise
      return open(out, 'wb')
    def ensureDirFor(self, name):
      out = os.path.join(self.basepath, name)
      outdir = os.path.dirname(out)
      if not os.path.isdir(outdir):
        try:
          os.makedirs(outdir)
        except OSError as error:
          if error.errno != errno.EEXIST:
            raise
      return out

  class OutputHelper_symlink(OutputHelper_flat):
    '''Subclass of OutputHelper_flat that provides a helper for
    creating a symlink including creating the parent directories.
    '''
    def symlink(self, src, dest):
      out = self.ensureDirFor(dest)
      # remove previous link or file
      try:
        os.remove(out)
      except OSError as e:
        if e.errno != errno.ENOENT:
          raise
      if sys.platform != "win32":
        os.symlink(src, out)
      else:
        # On Win32, use ctypes to create a hardlink
        rv = CreateHardLink(out, src, None)
        if rv == 0:
          raise WinError()
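
The regular expressions at the top of JarMaker define the jar.mn syntax that makeJar and processJarSection parse. A small sketch exercising the same patterns on a made-up jar.mn fragment (the jar name and file paths are illustrative only):

import re

jarline = re.compile('(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$')
entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+'
entryline = re.compile(entryre + '(?P<output>[\w\d.\-\_\\\/\+\@]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/\@]+)\))?\s*$')

print jarline.match('browser.jar:').group('jarfile')                  # browser
m = entryline.match('*  content/browser/foo.xul (foo.xul)')
print m.group('optPreprocess'), m.group('output'), m.group('source')  # * content/browser/foo.xul foo.xul
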
示例#52
0
class TopicModelHarness:
    def __init__(self, getTitle, getBody, getUrl):
        self.getTitle = getTitle
        self.getBody = getBody
        self.getUrl = getUrl

    def getRaw(self, path):
        raw = read_csv(path, sep='\t', na_values=['?']).fillna(-5)
        return raw

    def getColumns(self, raw):
        boilerplate = raw['boilerplate']
        urlid = raw['urlid']
        if 'label' in raw:
            labels = raw['label']
            return boilerplate, labels, urlid
        return boilerplate, urlid

    def getDocs(self, boilerplate):
        docs = []
        for row in boilerplate:
            rowObject = json.loads(row)
            doc = ''
            if 'title' in rowObject and rowObject['title'] and self.getTitle:
                doc += rowObject['title']
            if 'body' in rowObject and rowObject['body'] and self.getBody:
                doc += ' ' + rowObject['body']
            if 'url' in rowObject and rowObject['url'] and self.getUrl:
                doc += ' ' + rowObject['url']
            docs.append(doc)
        return docs

    def tag(self, str, tag):
        strList = str.split(' ')
        newstr = ''
        for s in strList:
            if s.lower() not in ENGLISH_STOP_WORDS:
                newstr += tag + '__' + s + ' ' + s + ' '
        return newstr

    def preprocessDocs(self, docs):
        preprocessed_docs = []
        for doc in docs:
            punctuation = [',','.', ';', '!', '?', ':']
            for p in punctuation:
                doc = doc.replace(p, ' ' + p + ' ')

            doc = doc.lower()
            preprocessed_docs.append(doc)

        return preprocessed_docs

    def expandVocab(self, docs):
        print 'expanding vocabulary...'
        freqCounts = self.countTokens(docs)

        tokenList = []
        freqCountList = []
        for token in freqCounts:
            tokenList.append(token)
            freqCountList.append(freqCounts[token])

        expTokenDf = DataFrame({'tokens': tokenList, 'freqCounts': freqCountList})
        expTokenDf = expTokenDf.sort('freqCounts', ascending=False)
        expandableTokensFiltered = set(expTokenDf['tokens'][2000:3000]).difference(ENGLISH_STOP_WORDS)
        batchSize = 10000
        print "%d filtered tokens chosen" % len(expandableTokensFiltered)
        print "Expandable tokens: "
        print expandableTokensFiltered
        newDocs = []
        for i in xrange(0,len(docs)):
            doc = docs[i]
            newDocSplit = doc.split()
            tokenList = doc.split(' ')
            start = 0
            newTokens = set()
            while start < len(tokenList):
                stop = start + batchSize
                tokens = set(tokenList[start:stop])
                start = start + batchSize/2
                tokensToExpand = tokens.intersection(expandableTokensFiltered)
                newTokens = newTokens.union(self.expandVocabFromSet(tokensToExpand))

            newDocSplit.extend(list(newTokens))
            newDoc = ''
            for token in newDocSplit:
                newDoc += ' ' + token + ' '
            newDocs.append(newDoc)

            if i % 500 == 0:
                print '\nprocessed %d docs' % i
                print '%d new tokens added to document' % len(newTokens)
                print 'new tokens:'
                print newTokens
                print len(tokens)

        return newDocs

    def expandVocabFromSet(self, tokensToExpand):
        expanded = set()
        for token1 in tokensToExpand:
            for token2 in tokensToExpand:
                if token1 != token2:
                    hash = self.getTwoTokenHash(token1, token2)
                    if hash not in expanded:
                        expanded.add(hash)
        return expanded

    def getTwoTokenHash(self, token1, token2):
        l = [token1, token2]
        l.sort()
        hash = l[0] + '___' + l[1]
        return hash

    def replaceRareWords(self, docs, rareWords):
        processed_docs = []
        for doc in docs:
            punctuation = [',','.', ';', '!', '?', ':']
            for p in punctuation:
                doc = doc.replace(p, ' ' + p + ' ')

            docList = [(self.classifyRareWord(d) if (d in rareWords) else d) for d in doc.split(' ')]
            doc = ''

            for d in docList:
                doc += ' %s ' % d

            processed_docs.append(doc)

        return processed_docs

    def removeStopWords(self, docs, stopWords):
        processed_docs = []
        for doc in docs:
            punctuation = [',','.', ';', '!', '?', ':']
            for p in punctuation:
                doc = doc.replace(p, ' ' + p + ' ')

            docList = [('' if (d in stopWords) else d) for d in doc.split(' ')]
            doc = ''

            for d in docList:
                doc += ' %s ' % d

            processed_docs.append(doc)

        return processed_docs

    def getStopWords(self, freqs, threshold):
        stopWords = set()
        for token in freqs:
            if freqs[token] > threshold:
                stopWords.add(token)
        return stopWords

    def classifyRareWord(self, word):
        if word.find('-') >= 0:
            words = word.split('-')
            out = ''
            for w in words:
                out += w + ' '
            return out
        if word.isdigit():
            return '__ISDIGIT__'
        return '__RARE__'

    def countTokens(self, docs):
        freqCounts = {}
        #for doc in docs, yi in y:
        for i in xrange(0,len(docs)):
            doc = docs[i]
            tokenList = doc.split(' ')
            for token in tokenList:
                if token in freqCounts:
                    freqCounts[token] += 1
                else:
                    freqCounts[token] = 1
        return freqCounts

    def getAmbiguousTokens(self, freqsByClass):
        ambiguousTokens = set()
        for token in freqsByClass:
            if np.abs(freqsByClass[token]) < 0.1:
                ambiguousTokens.add(token)
        return ambiguousTokens


    def getFreqs(self, freqCounts):
        # get total token count
        totalTokenCount = 0
        for token in freqCounts:
            totalTokenCount += freqCounts[token]

        freqs = {}
        for token in freqCounts:
            freqs[token] = float(freqCounts[token]) / float(totalTokenCount)
        return freqs

    def getRareWords(self, freqCounts):
        #freqCounts = self.countTokens(docs)
        rareWords = set()
        for token in freqCounts:
            if freqCounts[token] <= 1:
                rareWords.add(token)
        return rareWords

    def vectorize(self, docs, stopWords=None, fit=False):
        print "vectorizing..."
        #vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
        #vectorizer = HashingVectorizer(stop_words=stopWords, non_negative=True, norm='l2')


        vectorizedDocs = self.vectorizer.transform(docs)

        #print vectorizedDocs
        return vectorizedDocs

    def standardizeVecs(self, vectorizedDocs):
        print "standardizing vectors..."
        s = vectorizedDocs
        #s_lil = vectorizedDocs.tolil()

        """
        col_sum = s.sum(axis=0)
        (rows, cols) = s.nonzero()
        s_normalized = lil_matrix(s.shape, dtype='float64')
        for i in xrange(0,len(rows)):
            s_normalized[rows[i], cols[i]] = s[rows[i], cols[i]] / col_sum[0, cols[i]]
            if i%50000==0:
                print i
        return s_normalized.tocsr()
        """
        # standardize
        #means = s.mean(axis=0)

        # initialize mean matrix
        #mean_lil = lil_matrix(s.shape, dtype='float64')

        (rows, cols)  = s.nonzero()
        #for i in xrange(0, len(rows)):
        #    mean_lil[rows[i], cols[i]] = means[0, cols[i]]
        #mean_csr = mean_lil.tocsr()
        #s_zeroMean = (s - mean_csr)
        #s_stdDev = (s_zeroMean.multiply(s_zeroMean)).mean(axis=0)
        norm = (s.multiply(s)).sum(axis=0)
        s_standardized = lil_matrix(s.shape, dtype='float64')
        #print s_stdDev.shape
        print s_standardized.shape
        #print s_zeroMean.shape
        for i in xrange(0,len(rows)):
            #s_standardized[rows[i], cols[i]] = s_zeroMean[rows[i], cols[i]] / s_stdDev[0, cols[i]]
            s_standardized[rows[i], cols[i]] = float(np.abs(s[rows[i], cols[i]])) / np.sqrt(float(norm[0, cols[i]]))
            if i%50000==0:
                print i
        for i in xrange(0,50):
            print s_standardized[i, 0]

        return s_standardized.tocsr()

    def trainFromDataFrame(self, df):
        print "training from data frame..."
        boilerplate, y, urlid = self.getColumns(df)
        docs = self.getDocs(boilerplate)
        #rareWords = self.getRareWords(docs)
        docs = self.preprocessDocs(docs)
        X = self.vectorize(docs)
        self.fit(X, y)


    def fit(self, X, y):
        print 'training topic model...'
        #self.model = TopicModel()
        #self.model = LogisticRegression(penalty='l2', dual=True, C=.8)
        #self.model.fit(X,y)

        #params = {'C': linspace(.3, .8, 1), 'numFeatures': linspace(1000, 15000, 5)}
        #params = {'C': linspace(.5, 1, 2), 'numFeatures': linspace(25000, 75000, 2)}
        #print params
        #params = {'C': logspace(-1,4,10), 'gamma':logspace(0,0,1)}
        params = {'C': linspace(.2,2,10)}
        clf = TopicModel()
        self.model = GridSearchCV(clf, param_grid=params, scoring='roc_auc', cv=5, verbose=2, n_jobs=4)
        self.model.fit(X, y)

        try:
            print 'Best Params:'
            print self.model.best_params_
            print 'Best Score: '
            print self.model.best_score_
            print self.model.grid_scores_
        except:
            pass

        self.model = self.model.best_estimator_

    def addTopicModel(self, boilerplate):
        docs = self.getDocs(boilerplate)
        X_extracted = self.vectorize(docs)
        #y_predicted = self.predict(X_extracted)
        y_predicted = self.binResults(self.predict(X_extracted), .05)
        return y_predicted

    def addTotalWordCounts(self, boilerplate):
        docs = self.getDocs(boilerplate)
        wordCounts = [(len(doc.split(' '))) for doc in docs]
        return wordCounts


    def predict(self, X):
        return self.model.predict(X)
        #return self.model.predict_proba(X)[:,1]


    def addAlchemyCategories(self, docs, alchemyCategory):
        for i in xrange(0, len(docs)):
            docs[i] = docs[i] + ' __' + str(alchemyCategory[i]) + ' '
        return docs

    def trainVectorizer(self):
        print "Training vectorizer..."
        raw = self.getRaw('train.tsv')
        rawTest = self.getRaw('test.tsv')
        boilerplate = list(raw['boilerplate'])
        boilerplate.extend(list(rawTest['boilerplate']))
        docs = self.getDocs(boilerplate)
        docs = self.preprocessDocs(docs)
        #docs = self.expandVocab(docs)
        print 'all docs length: %d' % len(docs)

        self.vectorizer =TfidfVectorizer(min_df=3,  max_features=None, strip_accents='unicode',
                        analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), use_idf=1,smooth_idf=1,
                        sublinear_tf=1)
        self.vectorizer.fit(docs)
        print "vectorizing..."
        X = self.vectorizer.transform(docs)
        print "finding principal components..."
        self.tsvd = TruncatedSVD(n_components = 500)
        self.tsvd.fit(X)

    def getXy(self, path):
        raw = self.getRaw(path)
        docs = self.getDocs(raw['boilerplate'])

        if 'label' in raw:
            y = raw['label']

        docs = self.preprocessDocs(docs)
        #docs = self.expandVocab(docs)
        print "vectorizing..."
        X_text = self.vectorizer.transform(docs)
        X_text = self.tsvd.transform(X_text)

        print 'X Sparse Array Size:'
        print X_text.shape
        self.Pre = Preprocessor()
        X_meta, y, urlid = self.Pre.preprocess(raw)
        #X_meta = np.abs(X_meta)
        #X = hstack([X_meta,X_text])
        X = X_text
        d = {'X': X, 'y':y, 'urlid': urlid}
        return d

    def runModel(self, testSize, debug):
        self.trainVectorizer()
        d = self.getXy('train.tsv')
        if debug:
            X_train, X_test, y_train, y_test = train_test_split(d['X'], d['y'], test_size=testSize, random_state=5)
        else:
            X_train = d['X']
            y_train = d['y']
            d_test = self.getXy('test.tsv')
            X_test = d_test['X']
            urlid = d_test['urlid']

        self.fit(X_train, y_train)
        print "20 Fold CV Score: ", np.mean(cross_val_score(self.model, d['X'], d['y'], cv=10, scoring='roc_auc'))
        y_predicted = self.predict(X_test)

        if debug:
            print 'Topic Model AUC Score: %f' % roc_auc_score(y_test, y_predicted)
        else:
            Pre = Preprocessor()
            Pre.generateSubmission('submission_12.csv', urlid, y_predicted)

        P.figure()
        P.hist(y_predicted, bins=100)
        P.show()
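
trainVectorizer, getXy and fit above hand-roll a TF-IDF, TruncatedSVD (LSA) and classifier flow over the boilerplate text. For comparison only, the same flow as a scikit-learn Pipeline; LogisticRegression stands in for the project's TopicModel, which is not part of this listing.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

# TF-IDF over word 1-2 grams, reduce to 500 latent components, then classify.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=3, strip_accents='unicode', analyzer='word',
                              token_pattern=r'\w{1,}', ngram_range=(1, 2),
                              sublinear_tf=True)),
    ('svd', TruncatedSVD(n_components=500)),
    ('clf', LogisticRegression(penalty='l2', C=0.8)),
])
# Usage, with docs/y as produced by getDocs, preprocessDocs and getColumns:
#   pipeline.fit(docs, y)
#   probs = pipeline.predict_proba(test_docs)[:, 1]
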
 def setUp(self):
   self.pp = Preprocessor()
   self.pp.out = StringIO()
class TestPreprocessor(unittest.TestCase):
  """
  Unit tests for the Preprocessor class
  """

  def setUp(self):
    self.pp = Preprocessor()
    self.pp.out = StringIO()

  def test_conditional_if_0(self):
    f = NamedIO("conditional_if_0.in", """#if 0
FAIL
#else
PASS
#endif
""")
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")

  def test_string_value(self):
    f = NamedIO("string_value.in", """#define FOO STRING
#if FOO
string value is true
#else
string value is false
#endif
""")
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "string value is false\n")
  
  def test_number_value(self):
    f = NamedIO("string_value.in", """#define FOO 1
#if FOO
number value is true
#else
number value is false
#endif
""")
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "number value is true\n")
  
  def test_conditional_if_0_elif_1(self):
    f = NamedIO('conditional_if_0_elif_1.in', '''#if 0
#elif 1
PASS
#else
FAIL
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_conditional_if_1(self):
    f = NamedIO('conditional_if_1.in', '''#if 1
PASS
#else
FAIL
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_conditional_if_1_elif_1_else(self):
    f = NamedIO('conditional_if_1_elif_1_else.in', '''#if 1
PASS
#elif 1
FAIL
#else
FAIL
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_conditional_if_1_if_1(self):
    f = NamedIO('conditional_if_1_if_1.in', '''#if 1
#if 1
PASS
#else
FAIL
#endif
#else
FAIL
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_conditional_not_0(self):
    f = NamedIO('conditional_not_0.in', '''#if !0
PASS
#else
FAIL
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_conditional_not_1(self):
    f = NamedIO('conditional_not_1.in', '''#if !1
FAIL
#else
PASS
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_conditional_not_emptyval(self):
    f = NamedIO('conditional_not_emptyval.in', '''#define EMPTYVAL
#if !EMPTYVAL
FAIL
#else
PASS
#endif
#if EMPTYVAL
PASS
#else
FAIL
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\nPASS\n")
  
  def test_conditional_not_nullval(self):
    f = NamedIO('conditional_not_nullval.in', '''#define NULLVAL 0
#if !NULLVAL
PASS
#else
FAIL
#endif
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_expand(self):
    f = NamedIO('expand.in', '''#define ASVAR AS
#expand P__ASVAR__S
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")

  def test_undef_defined(self):
    f = NamedIO('undef_defined.in', '''#define BAR
#undef BAR
BAR
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "BAR\n")

  def test_undef_undefined(self):
    f = NamedIO('undef_undefined.in', '''#undef VAR
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "")
  
  def test_filter_attemptSubstitution(self):
    f = NamedIO('filter_attemptSubstitution.in', '''#filter attemptSubstitution
P@VAR@ASS
#unfilter attemptSubstitution
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")
  
  def test_filter_emptyLines(self):
    f = NamedIO('filter_emptyLines.in', '''lines with a

blank line
#filter emptyLines
lines with

no blank lines
#unfilter emptyLines
yet more lines with

blank lines
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), '''lines with a

blank line
lines with
no blank lines
yet more lines with

blank lines
''')
  
  def test_filter_slashslash(self):
    f = NamedIO('filter_slashslash.in', '''#filter slashslash
PASS//FAIL  // FAIL
#unfilter slashslash
PASS // PASS
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\nPASS // PASS\n")
  
  def test_filter_spaces(self):
    f = NamedIO('filter_spaces.in', '''#filter spaces
You should see two nice ascii tables
 +-+-+-+
 | |   |     |
 +-+-+-+
#unfilter spaces
+-+---+
| |   |
+-+---+ 
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), """You should see two nice ascii tables
+-+-+-+
| | | |
+-+-+-+
+-+---+
| |   |
+-+---+ 
""")
  
  def test_filter_substitution(self):
    f = NamedIO('filter_substitution.in', '''#define VAR ASS
#filter substitution
P@VAR@
#unfilter substitution
''')
    self.pp.do_include(f)
    self.assertEqual(self.pp.out.getvalue(), "PASS\n")

  def test_error(self):
    f = NamedIO('error.in', '''#error spit this message out
''')
    caught_msg = None
    try:
      self.pp.do_include(f)
    except Preprocessor.Error, e:
      caught_msg = e.args[0][-1]
    self.assertEqual(caught_msg, 'spit this message out')
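
The tests above rely on a NamedIO helper that is not part of this listing. A minimal stand-in, assuming do_include only needs a file-like object that also carries a name for error reporting:

from StringIO import StringIO

class NamedIO(StringIO):
    """A StringIO that also exposes a file name, for Preprocessor.do_include."""
    def __init__(self, name, content):
        self.name = name
        StringIO.__init__(self, content)
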
def main():
    pp = Preprocessor()
    tdpath = 'dataset/test/test-data-1'
    pp.process_test_data(tdpath)
示例#56
0
 def setUp(self):
   self.pp = Preprocessor()
   self.pp.out = StringIO()
   self.tempnam = os.tempnam('.')
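
os.tempnam in the fragment above exists only on Python 2 and warns that the returned name can be raced; if this fixture were ported, tempfile is the usual replacement. A sketch under that assumption:

import os
import tempfile

# Create the temporary file up front and keep only its path, avoiding the race
# between name generation and file creation that os.tempnam('.') allows.
fd, tempnam = tempfile.mkstemp(dir='.')
os.close(fd)
print tempnam
os.remove(tempnam)
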
class JarMaker(object):
  '''JarMaker reads jar.mn files and process those into jar files or
  flat directories, along with chrome.manifest files.
  '''

  ignore = re.compile('\s*(\#.*)?$')
  jarline = re.compile('(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$')
  relsrcline = re.compile('relativesrcdir\s+(?P<relativesrcdir>.+?):')
  regline = re.compile('\%\s+(.*)$')
  entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+'
  entryline = re.compile(entryre + '(?P<output>[\w\d.\-\_\\\/\+\@]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/\@]+)\))?\s*$')

  def __init__(self, outputFormat = 'flat', useJarfileManifest = True,
               useChromeManifest = False):
    self.outputFormat = outputFormat
    self.useJarfileManifest = useJarfileManifest
    self.useChromeManifest = useChromeManifest
    self.pp = Preprocessor()
    self.topsourcedir = None
    self.sourcedirs = []
    self.localedirs = None
    self.l10nbase = None
    self.l10nmerge = None
    self.relativesrcdir = None
    self.rootManifestAppId = None

  def getCommandLineParser(self):
    '''Get a optparse.OptionParser for jarmaker.

    This OptionParser has the options for jarmaker as well as
    the options for the inner PreProcessor.
    '''
    # HACK, we need to unescape the string variables we get,
    # the perl versions didn't grok strings right
    p = self.pp.getCommandLineParser(unescapeDefines = True)
    p.add_option('-f', type="choice", default="jar",
                 choices=('jar', 'flat', 'symlink'),
                 help="fileformat used for output", metavar="[jar, flat, symlink]")
    p.add_option('-v', action="store_true", dest="verbose",
                 help="verbose output")
    p.add_option('-q', action="store_false", dest="verbose",
                 help="verbose output")
    p.add_option('-e', action="store_true",
                 help="create chrome.manifest instead of jarfile.manifest")
    p.add_option('--both-manifests', action="store_true",
                 dest="bothManifests",
                 help="create chrome.manifest and jarfile.manifest")
    p.add_option('-s', type="string", action="append", default=[],
                 help="source directory")
    p.add_option('-t', type="string",
                 help="top source directory")
    p.add_option('-c', '--l10n-src', type="string", action="append",
                 help="localization directory")
    p.add_option('--l10n-base', type="string", action="store",
                 help="base directory to be used for localization (requires relativesrcdir)")
    p.add_option('--locale-mergedir', type="string", action="store",
                 help="base directory to be used for l10n-merge (requires l10n-base and relativesrcdir)")
    p.add_option('--relativesrcdir', type="string",
                 help="relativesrcdir to be used for localization")
    p.add_option('-j', type="string",
                 help="jarfile directory")
    p.add_option('--root-manifest-entry-appid', type="string",
                 help="add an app id specific root chrome manifest entry.")
    return p

  def processIncludes(self, includes):
    '''Process given includes with the inner PreProcessor.

    Only use this for #defines, the includes shouldn't generate
    content.
    '''
    self.pp.out = StringIO()
    for inc in includes:
      self.pp.do_include(inc)
    includesvalue = self.pp.out.getvalue()
    if includesvalue:
      logging.info("WARNING: Includes produce non-empty output")
    self.pp.out = None
    pass

  def finalizeJar(self, jarPath, chromebasepath, register,
                  doZip=True):
    '''Helper method to write out the chrome registration entries to
    jarfile.manifest or chrome.manifest, or both.

    The actual file processing is done in updateManifest.
    '''
    # rewrite the manifest, if entries given
    if not register:
      return

    chromeManifest = os.path.join(os.path.dirname(jarPath),
                                  '..', 'chrome.manifest')

    if self.useJarfileManifest:
      self.updateManifest(jarPath + '.manifest', chromebasepath % '',
                          register)
      addEntriesToListFile(chromeManifest, ['manifest chrome/%s.manifest' % (os.path.basename(jarPath),)])
    if self.useChromeManifest:
      self.updateManifest(chromeManifest, chromebasepath % 'chrome/',
                          register)

    # If requested, add a root chrome manifest entry (assumed to be in the parent directory
    # of chromeManifest) with the application specific id. In cases where we're building
    # lang packs, the root manifest must know about application sub directories.
    if self.rootManifestAppId:
      rootChromeManifest = os.path.join(os.path.normpath(os.path.dirname(chromeManifest)),
                                        '..', 'chrome.manifest')
      rootChromeManifest = os.path.normpath(rootChromeManifest)
      chromeDir = os.path.basename(os.path.dirname(os.path.normpath(chromeManifest)))
      logging.info("adding '%s' entry to root chrome manifest appid=%s" % (chromeDir, self.rootManifestAppId))
      addEntriesToListFile(rootChromeManifest, ['manifest %s/chrome.manifest application=%s' % (chromeDir, self.rootManifestAppId)])

  def updateManifest(self, manifestPath, chromebasepath, register):
    '''updateManifest replaces the % in the chrome registration entries
    with the given chrome base path, and updates the given manifest file.
    '''
    lock = lockFile(manifestPath + '.lck')
    try:
      myregister = dict.fromkeys(map(lambda s: s.replace('%', chromebasepath),
                                     register.iterkeys()))
      manifestExists = os.path.isfile(manifestPath)
      mode = (manifestExists and 'r+b') or 'wb'
      mf = open(manifestPath, mode)
      if manifestExists:
        # import previous content into hash, ignoring empty ones and comments
        imf = re.compile('(#.*)?$')
        for l in re.split('[\r\n]+', mf.read()):
          if imf.match(l):
            continue
          myregister[l] = None
        mf.seek(0)
      for k in myregister.iterkeys():
        mf.write(k + os.linesep)
      mf.close()
    finally:
      lock = None

  def makeJar(self, infile, jardir):
    '''makeJar is the main entry point to JarMaker.

    It takes the input file, the output directory, the source dirs and the
    top source dir as argument, and optionally the l10n dirs.
    '''
    # making paths absolute, guess srcdir if file and add to sourcedirs
    _normpath = lambda p: os.path.normpath(os.path.abspath(p))
    self.topsourcedir = _normpath(self.topsourcedir)
    self.sourcedirs = [_normpath(p) for p in self.sourcedirs]
    if self.localedirs:
      self.localedirs = [_normpath(p) for p in self.localedirs]
    elif self.relativesrcdir:
      self.localedirs = self.generateLocaleDirs(self.relativesrcdir)
    if isinstance(infile, basestring):
      logging.info("processing " + infile)
      self.sourcedirs.append(_normpath(os.path.dirname(infile)))
    pp = self.pp.clone()
    pp.out = StringIO()
    pp.do_include(infile)
    lines = pushback_iter(pp.out.getvalue().splitlines())
    try:
      while True:
        l = lines.next()
        m = self.jarline.match(l)
        if not m:
          raise RuntimeError(l)
        if m.group('jarfile') is None:
          # comment
          continue
        self.processJarSection(m.group('jarfile'), lines, jardir)
    except StopIteration:
      # we read the file
      pass
    return

  def generateLocaleDirs(self, relativesrcdir):
    if os.path.basename(relativesrcdir) == 'locales':
      # strip locales
      l10nrelsrcdir = os.path.dirname(relativesrcdir)
    else:
      l10nrelsrcdir = relativesrcdir
    locdirs = []
    # generate locales dirs, merge, l10nbase, en-US
    if self.l10nmerge:
      locdirs.append(os.path.join(self.l10nmerge, l10nrelsrcdir))
    if self.l10nbase:
      locdirs.append(os.path.join(self.l10nbase, l10nrelsrcdir))
    if self.l10nmerge or not self.l10nbase:
      # add en-US if we merge, or if it's not l10n
      locdirs.append(os.path.join(self.topsourcedir, relativesrcdir, 'en-US'))
    return locdirs

  def processJarSection(self, jarfile, lines, jardir):
    '''Internal method called by makeJar to actually process a section
    of a jar.mn file.

    jarfile is the basename of the jarfile or the directory name for 
    flat output, lines is a pushback_iterator of the lines of jar.mn,
    the remaining options are carried over from makeJar.
    '''

    # chromebasepath is used for chrome registration manifests
    # %s is getting replaced with chrome/ for chrome.manifest, and with
    # an empty string for jarfile.manifest
    chromebasepath = '%s' + os.path.basename(jarfile)
    if self.outputFormat == 'jar':
      chromebasepath = 'jar:' + chromebasepath + '.jar!'
    chromebasepath += '/'

    jarfile = os.path.join(jardir, jarfile)
    jf = None
    if self.outputFormat == 'jar':
      #jar
      jarfilepath = jarfile + '.jar'
      try:
        os.makedirs(os.path.dirname(jarfilepath))
      except OSError, error:
        if error.errno != errno.EEXIST:
          raise
      jf = ZipFile(jarfilepath, 'a', lock = True)
      outHelper = self.OutputHelper_jar(jf)
    else:
示例#58
0
File: main.py  Project: kiranp11/ctr
    def run(self):
        preprocessor = Preprocessor()
        preprocessor.load_encoder()

        classifier = Classifier(preprocessor)
        classifier.run()
示例#59
0
class JarMaker(object):
    '''JarMaker reads jar.mn files and process those into jar files or
  flat directories, along with chrome.manifest files.
  '''

    ignore = re.compile('\s*(\#.*)?$')
    jarline = re.compile(
        '(?:(?P<jarfile>[\w\d.\-\_\\\/]+).jar\:)|(?:\s*(\#.*)?)\s*$')
    regline = re.compile('\%\s+(.*)$')
    entryre = '(?P<optPreprocess>\*)?(?P<optOverwrite>\+?)\s+'
    entryline = re.compile(
        entryre +
        '(?P<output>[\w\d.\-\_\\\/\+]+)\s*(\((?P<locale>\%?)(?P<source>[\w\d.\-\_\\\/]+)\))?\s*$'
    )

    def __init__(self,
                 outputFormat='flat',
                 useJarfileManifest=True,
                 useChromeManifest=False):
        self.outputFormat = outputFormat
        self.useJarfileManifest = useJarfileManifest
        self.useChromeManifest = useChromeManifest
        self.pp = Preprocessor()

    def getCommandLineParser(self):
        '''Get a optparse.OptionParser for jarmaker.

    This OptionParser has the options for jarmaker as well as
    the options for the inner PreProcessor.
    '''
        # HACK, we need to unescape the string variables we get,
        # the perl versions didn't grok strings right
        p = self.pp.getCommandLineParser(unescapeDefines=True)
        p.add_option(
            '-f',
            type="choice",
            default="jar",
            choices=('jar', 'flat', 'symlink'),
            help="fileformat used for output",
            metavar="[jar, flat, symlink]")
        p.add_option(
            '-v', action="store_true", dest="verbose", help="verbose output")
        p.add_option(
            '-q', action="store_false", dest="verbose", help="verbose output")
        p.add_option(
            '-e',
            action="store_true",
            help="create chrome.manifest instead of jarfile.manifest")
        p.add_option(
            '--both-manifests',
            action="store_true",
            dest="bothManifests",
            help="create chrome.manifest and jarfile.manifest")
        p.add_option(
            '-s',
            type="string",
            action="append",
            default=[],
            help="source directory")
        p.add_option('-t', type="string", help="top source directory")
        p.add_option(
            '-c',
            '--l10n-src',
            type="string",
            action="append",
            help="localization directory")
        p.add_option(
            '--l10n-base',
            type="string",
            action="append",
            default=[],
            help="base directory to be used for localization (multiple)")
        p.add_option('-j', type="string", help="jarfile directory")
        # backwards compat, not needed
        p.add_option(
            '-a',
            action="store_false",
            default=True,
            help=
            "NOT SUPPORTED, turn auto-registration of chrome off (installed-chrome.txt)"
        )
        p.add_option('-d', type="string", help="UNUSED, chrome directory")
        p.add_option('-o', help="cross compile for auto-registration, ignored")
        p.add_option(
            '-l',
            action="store_true",
            help="ignored (used to switch off locks)")
        p.add_option('-x', action="store_true", help="force Unix")
        p.add_option('-z', help="backwards compat, ignored")
        p.add_option('-p', help="backwards compat, ignored")
        return p

    def processIncludes(self, includes):
        '''Process given includes with the inner PreProcessor.

    Only use this for #defines, the includes shouldn't generate
    content.
    '''
        self.pp.out = StringIO()
        for inc in includes:
            self.pp.do_include(inc)
        includesvalue = self.pp.out.getvalue()
        if includesvalue:
            logging.info("WARNING: Includes produce non-empty output")
        self.pp.out = None
        pass

    def finalizeJar(self, jarPath, chromebasepath, register, doZip=True):
        '''Helper method to write out the chrome registration entries to
    jarfile.manifest or chrome.manifest, or both.

    The actual file processing is done in updateManifest.
    '''
        # rewrite the manifest, if entries given
        if not register:
            return
        if self.useJarfileManifest:
            self.updateManifest(jarPath + '.manifest', chromebasepath % '',
                                register)
        if self.useChromeManifest:
            manifestPath = os.path.join(
                os.path.dirname(jarPath), '..', 'chrome.manifest')
            self.updateManifest(manifestPath, chromebasepath % 'chrome/',
                                register)

    def updateManifest(self, manifestPath, chromebasepath, register):
        '''updateManifest replaces the % in the chrome registration entries
    with the given chrome base path, and updates the given manifest file.
    '''
        myregister = dict.fromkeys(
            map(lambda s: s.replace('%', chromebasepath), register.iterkeys()))
        manifestExists = os.path.isfile(manifestPath)
        mode = (manifestExists and 'r+b') or 'wb'
        mf = open(manifestPath, mode)
        if manifestExists:
            # import previous content into hash, ignoring empty ones and comments
            imf = re.compile('(#.*)?$')
            for l in re.split('[\r\n]+', mf.read()):
                if imf.match(l):
                    continue
                myregister[l] = None
            mf.seek(0)
        for k in myregister.iterkeys():
            mf.write(k + os.linesep)
        mf.close()

    def makeJar(self,
                infile=None,
                jardir='',
                sourcedirs=[],
                topsourcedir='',
                localedirs=None):
        '''makeJar is the main entry point to JarMaker.

    It takes the input file, the output directory, the source dirs and the
    top source dir as argument, and optionally the l10n dirs.
    '''
        if isinstance(infile, basestring):
            logging.info("processing " + infile)
        pp = self.pp.clone()
        pp.out = StringIO()
        pp.do_include(infile)
        lines = pushback_iter(pp.out.getvalue().splitlines())
        try:
            while True:
                l = lines.next()
                m = self.jarline.match(l)
                if not m:
                    raise RuntimeError(l)
                if m.group('jarfile') is None:
                    # comment
                    continue
                self.processJarSection(
                    m.group('jarfile'), lines, jardir, sourcedirs,
                    topsourcedir, localedirs)
        except StopIteration:
            # we read the file
            pass
        return

    def makeJars(self,
                 infiles,
                 l10nbases,
                 jardir='',
                 sourcedirs=[],
                 topsourcedir='',
                 localedirs=None):
        '''makeJars is the second main entry point to JarMaker.

    It takes an iterable sequence of input file names, the l10nbases,
    the output directory, the source dirs and the
    top source dir as argument, and optionally the l10n dirs.

    It iterates over all inputs, guesses srcdir and l10ndir from the
    path and topsourcedir and calls into makeJar.

    The l10ndirs are created by guessing the relativesrcdir, and resolving
    that against the l10nbases. l10nbases can either be path strings, or 
    callables. In the latter case, that will be called with the 
    relativesrcdir as argument, and is expected to return a path string.
    This logic is disabled if the jar.mn path is not inside the topsrcdir.
    '''
        topsourcedir = os.path.normpath(os.path.abspath(topsourcedir))

        def resolveL10nBase(relpath):
            def _resolve(base):
                if isinstance(base, basestring):
                    return os.path.join(base, relpath)
                if callable(base):
                    return base(relpath)
                return base

            return _resolve

        for infile in infiles:
            srcdir = os.path.normpath(os.path.abspath(os.path.dirname(infile)))
            l10ndir = srcdir
            if os.path.basename(srcdir) == 'locales':
                l10ndir = os.path.dirname(l10ndir)

            l10ndirs = None
            # srcdir may not be a child of topsourcedir, in which case
            # we assume that the caller passed in suitable sourcedirs,
            # and just skip passing in localedirs
            if srcdir.startswith(topsourcedir):
                rell10ndir = l10ndir[len(topsourcedir):].lstrip(os.sep)

                l10ndirs = map(resolveL10nBase(rell10ndir), l10nbases)
                if localedirs is not None:
                    l10ndirs += [
                        os.path.normpath(os.path.abspath(s))
                        for s in localedirs
                    ]
            srcdirs = [
                os.path.normpath(os.path.abspath(s)) for s in sourcedirs
            ] + [srcdir]
            self.makeJar(
                infile=infile,
                sourcedirs=srcdirs,
                topsourcedir=topsourcedir,
                localedirs=l10ndirs,
                jardir=jardir)

    def processJarSection(self, jarfile, lines, jardir, sourcedirs,
                          topsourcedir, localedirs):
        '''Internal method called by makeJar to actually process a section
    of a jar.mn file.

    jarfile is the basename of the jarfile or the directory name for 
    flat output, lines is a pushback_iterator of the lines of jar.mn,
    the remaining options are carried over from makeJar.
    '''

        # chromebasepath is used for chrome registration manifests
        # %s is getting replaced with chrome/ for chrome.manifest, and with
        # an empty string for jarfile.manifest
        chromebasepath = '%s' + jarfile
        if self.outputFormat == 'jar':
            chromebasepath = 'jar:' + chromebasepath + '.jar!'
        chromebasepath += '/'

        jarfile = os.path.join(jardir, jarfile)
        jf = None
        if self.outputFormat == 'jar':
            #jar
            jarfilepath = jarfile + '.jar'
            try:
                os.makedirs(os.path.dirname(jarfilepath))
            except OSError:
                pass
            jf = ZipFile(jarfilepath, 'a', lock=True)
            outHelper = self.OutputHelper_jar(jf)
        else:
            outHelper = getattr(self,
                                'OutputHelper_' + self.outputFormat)(jarfile)
        register = {}
        # This loop exits on either
        # - the end of the jar.mn file
        # - an line in the jar.mn file that's not part of a jar section
        # - on an exception raised, close the jf in that case in a finally
        try:
            while True:
                try:
                    l = lines.next()
                except StopIteration:
                    # we're done with this jar.mn, and this jar section
                    self.finalizeJar(jarfile, chromebasepath, register)
                    if jf is not None:
                        jf.close()
                    # reraise the StopIteration for makeJar
                    raise
                if self.ignore.match(l):
                    continue
                m = self.regline.match(l)
                if m:
                    rline = m.group(1)
                    register[rline] = 1
                    continue
                m = self.entryline.match(l)
                if not m:
                    # neither an entry line nor chrome reg, this jar section is done
                    self.finalizeJar(jarfile, chromebasepath, register)
                    if jf is not None:
                        jf.close()
                    lines.pushback(l)
                    return
                self._processEntryLine(m, sourcedirs, topsourcedir, localedirs,
                                       outHelper, jf)
        finally:
            if jf is not None:
                jf.close()
        return

    def _processEntryLine(self, m, sourcedirs, topsourcedir, localedirs,
                          outHelper, jf):
        out = m.group('output')
        src = m.group('source') or os.path.basename(out)
        # pick the right sourcedir -- l10n, topsrc or src
        if m.group('locale'):
            src_base = localedirs
        elif src.startswith('/'):
            # path/in/jar/file_name.xul     (/path/in/sourcetree/file_name.xul)
            # refers to a path relative to topsourcedir, use that as base
            # and strip the leading '/'
            src_base = [topsourcedir]
            src = src[1:]
        else:
            # use srcdirs and the objdir (current working dir) for relative paths
            src_base = sourcedirs + ['.']
        # check if the source file exists
        realsrc = None
        for _srcdir in src_base:
            if os.path.isfile(os.path.join(_srcdir, src)):
                realsrc = os.path.join(_srcdir, src)
                break
        if realsrc is None:
            if jf is not None:
                jf.close()
            raise RuntimeError(
                'File "%s" not found in %s' % (src, ', '.join(src_base)))
        if m.group('optPreprocess'):
            outf = outHelper.getOutput(out)
            inf = open(realsrc)
            pp = self.pp.clone()
            if src[-4:] == '.css':
                pp.setMarker('%')
            pp.out = outf
            pp.do_include(inf)
            outf.close()
            inf.close()
            return
        # copy or symlink if newer or overwrite
        if (m.group('optOverwrite')
                or (getModTime(realsrc) > outHelper.getDestModTime(
                    m.group('output')))):
            if self.outputFormat == 'symlink' and hasattr(os, 'symlink'):
                outHelper.symlink(realsrc, out)
                return
            outf = outHelper.getOutput(out)
            # open in binary mode, this can be images etc
            inf = open(realsrc, 'rb')
            outf.write(inf.read())
            outf.close()
            inf.close()

    class OutputHelper_jar(object):
        '''Provide getDestModTime and getOutput for a given jarfile.
    '''

        def __init__(self, jarfile):
            self.jarfile = jarfile

        def getDestModTime(self, aPath):
            try:
                info = self.jarfile.getinfo(aPath)
                return info.date_time
            except:
                return 0

        def getOutput(self, name):
            return ZipEntry(name, self.jarfile)

    class OutputHelper_flat(object):
        '''Provide getDestModTime and getOutput for a given flat
    output directory. The helper method ensureDirFor is used by
    the symlink subclass.
    '''

        def __init__(self, basepath):
            self.basepath = basepath

        def getDestModTime(self, aPath):
            return getModTime(os.path.join(self.basepath, aPath))

        def getOutput(self, name):
            out = self.ensureDirFor(name)
            # remove previous link or file
            try:
                os.remove(out)
            except OSError, e:
                if e.errno != 2:
                    raise
            return open(out, 'wb')

        def ensureDirFor(self, name):
            out = os.path.join(self.basepath, name)
            outdir = os.path.dirname(out)
            if not os.path.isdir(outdir):
                os.makedirs(outdir)
            return out