def grep_articles(self):
    """Grep the Washington Post article text files for every docid in
    ``self.docid_set``, appending all matching lines to ``wp.txt`` in the
    current working directory.

    Side effects: runs one external ``/usr/bin/grep`` process per docid,
    appends to ``wp.txt``, and logs progress via ``self.logger``.
    Returns nothing.
    """
    import glob  # local import: expand the wildcard ourselves, see below

    cmd = '/usr/bin/grep'
    pattern = FileLocations.get_dropbox_datasets_path() + \
        'washingtonpost/WashingtonPost/data/*.txt'
    # Expand the glob once up front — the matched file set is loop-invariant,
    # and doing it here lets us run grep WITHOUT shell=True (the original
    # built the command line by string concatenation and relied on the shell
    # for both wildcard expansion and the >> redirect, which is a shell-
    # injection/quoting hazard if a docid ever contains metacharacters).
    files = glob.glob(pattern)
    with open('wp.txt', 'ab') as output:
        for docid in self.docid_set:
            self.logger.info('%s', docid)
            self.logger.info('%s %s %s ', cmd, docid, pattern)
            if not files:
                # grep with no file arguments would block reading stdin.
                self.logger.warning('no files match %s; nothing to grep', pattern)
                break
            # subprocess.run with an argument list replaces Popen+wait; the
            # original's stdout=PIPE was dead code (output went to wp.txt via
            # the shell redirect) and a classic PIPE+wait deadlock pattern.
            result = subprocess.run([cmd, docid] + files, stdout=output)
            self.logger.info('return code %d ', result.returncode)
'%(asctime)s %(name)-12s %(levelname)-8s %(message)s')) logger = logging.getLogger(__name__) logger.addHandler(handler) logger.propagate = False logger.setLevel(logging.INFO) smb = BaseDocToSentiment() datasetDexter = DatasetDexter wikipediaDataset = WikipediaDataset() if use_dexter_dataset: document_list = datasetDexter.get_dexter_dataset( path=FileLocations.get_dropbox_dexter_path()) if use_wahington_post_dataset: document_list = datasetDexter.get_dexter_dataset( path=FileLocations.get_dropbox_datasets_path() + 'washingtonpost/', filename="washington_post.json") spotter = GoldenSpotter(document_list, wikipediaDataset) golden_saliency_by_entid_by_docid = datasetDexter.get_golden_saliency_by_entid_by_docid( document_list, wikipediaDataset) if train_model: salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset( spotter, golden_saliency_by_entid_by_docid, output_filename, document_to_feature_converter, None, train_docid_set, wikipediaDataset,
# Build the golden-spotter SEL feature-extraction pipeline for the
# Washington Post dataset.
# NOTE(review): min_number, max_number and break_early are read from an
# enclosing scope not visible here (presumably function parameters or
# parsed CLI arguments) — confirm against the surrounding code.
aws_util = AWSUtil()
smb = SelModelBuilder()

# if build_model:
#     sentiment_processor = smb.train_and_save_model(filename)
# else:
#     sentiment_processor = SentimentProcessor()
#     sentiment_processor.load_model(filename)

# "datset" spelling is the project API's method name, not a typo here.
dd = smb.get_dexter_datset()
wikipediaDataset = WikipediaDataset()

# Load the Washington Post documents (Dexter JSON format), then build the
# golden-standard spotter and the docid -> entity-id -> saliency lookup.
document_list = dd.get_dexter_dataset(
    path=FileLocations.get_dropbox_datasets_path() + 'washingtonpost/',
    filename="washington_post.json")
spotter = GoldenSpotter(document_list, wikipediaDataset)
golden_saliency_by_entid_by_docid = dd.get_golden_saliency_by_entid_by_docid(
    document_list, wikipediaDataset)

# Output paths encode the document-number range so runs over different
# slices of the corpus do not clobber each other.
output_filename = FileLocations.get_dropbox_intermediate_path() + \
    'sel_all_features_golden_spotter.washington_post.docnum.' + \
    str(min_number) + '-' + str(max_number) + '.txt'
heavy_feature_filename = FileLocations.get_temp_path() + \
    'sel_heavy_features_golden_spotter.washington_post.docnum.' + \
    str(min_number) + '-' + str(max_number) + '.txt'
light_feature_filename = FileLocations.get_temp_path() + \
    'sel_light_features_golden_spotter.washington_post.docnum.' + \
    str(min_number) + '-' + str(max_number) + '.txt'

# Feature extractor: no binary classifier is wired in (binary_classifier=None),
# so the 0.5 threshold and the 5000-candidate pass-through cap are the only
# gating; 23 light features are produced per candidate.
document_to_feature_converter = SelFeatureExtractor(
    spotter,
    binary_classifier_threshold=0.5,
    min_candidates_to_pass_through=5000,
    binary_classifier=None,
    light_feature_filename=light_feature_filename,
    heavy_feature_filename=heavy_feature_filename,
    num_light_features=23,
    break_early=break_early)