def load_from_files(self, directory_name, instances_filename=None, labels_filename=None, name=None): """Load sample from the directory directory_name. Args: directory_name (string): Name of directory to read files from. instances_filename (string): Name of the file containing the instances matrix in numpy compressed format. labels_filename (string, optional): Name of the file containing the labels in pickled format. name(string, optional): additional name to add into the dataset files. """ super(BaseSampledDataset, self).load_from_files(directory_name) if instances_filename: self._instances = numpy.load(instances_filename) if self._instances is None: logging.error('Error loading instances from file {}'.format( instances_filename )) if labels_filename: self._labels = utils.pickle_from_file(labels_filename) self._sample_indices = utils.pickle_from_file( self._get_objective_filename(directory_name, 'sample_indices', name)) self.samples_num = len(self._sample_indices)
def load_from_files(self, directory_name, instances_filename=None, labels_filename=None, name=None): """Builds dataset from files saved in the directory directory_name. Args: directory_name (string): Name of directory to read files from. instances_filename (string): Name of the file containing the instances matrix in numpy compressed format. labels_filename (string, optional): Name of the file containing the labels in pickled format. name(string, optional): additional name to add into the dataset files. """ instances = None if instances_filename: instances = numpy.load(instances_filename) if instances is None: logging.error('Error loading instances from file {}'.format( instances_filename )) labels = None if labels_filename: labels = utils.pickle_from_file(labels_filename) indices = utils.pickle_from_file(self._get_objective_filename( directory_name, 'indices', name)) self.create_from_matrixes(instances, indices, labels)
def main():
    args = docopt(__doc__, version=1.0)
    mapping = utils.pickle_from_file(args['<mapping_filename>'])
    graph = utils.pickle_from_file(args['<graph_filename>'])
    yago_to_lkif = invert_mapping(mapping)
    # Propagate mappings from the oldest ancestors to nodes without a mapping.
    for node in graph.nodes():
        if len(yago_to_lkif[node]) != 0:
            continue
        for ancestor in get_oldest_ancestors(node, graph):
            yago_to_lkif[node].update(yago_to_lkif[ancestor])
    utils.pickle_to_file(dict(yago_to_lkif), args['<output_file>'])
def main(): """Main function of script.""" args = utils.read_arguments(__doc__) # Read dataset. Each row of x_matrix is a sentence. x_matrix, y_vector = utils.pickle_from_file(args['input_filename']) # Get Stanford model parser = StanfordParser( model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8') # Get parse trees. parsed_matrix = [] for index, document in tqdm(enumerate(x_matrix), total=len(x_matrix)): parsed_document = [] for paragraph_index, paragraph in enumerate(document): parsed_paragraph = [] for sentence_index, sentence in enumerate(paragraph): try: parsed_paragraph.append( list( parser.raw_parse( six.text_type(sentence.decode('utf-8'))))) except UnicodeDecodeError: logging.warning( 'Skip sentence {}-{}-{} for unicode error'.format( index, paragraph_index, sentence_index)) y_vector[index].pop(sentence_index) parsed_document.append(parsed_paragraph) parsed_matrix.append(parsed_document) # Save output logging.info('Saving {} documents'.format(len(parsed_matrix))) utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename']) logging.info('All operations finished')
def test_matrix(self):
    """Test the size of the sparse matrix generated."""
    extractor = ConllFeatureExtractor(use_structural=True)
    self.document.parse_trees = utils.pickle_from_file(
        os.path.join('test_files', 'parse_trees.pickle'))
    matrix = extractor.transform([self.document])
    self.assertIsInstance(matrix, csr_matrix)
    self.assertEqual(
        sum([len(sentence.words) for sentence in self.document.sentences]),
        matrix.shape[0])
def test_lexical_features(self):
    """Test extraction of lexical features."""
    extractor = ConllFeatureExtractor(use_structural=False,
                                      use_lexical=True)
    self.document.parse_trees = utils.pickle_from_file(
        os.path.join('test_files', 'parse_trees.pickle'))
    features = extractor.get_lexical_features(self.document)[3][0]
    self.assertEqual('PP[From/IN]', features['ls:token_comb'])
    self.assertEqual('IN[From/IN]-NP[perspective/NN]',
                     features['ls:right_comb'])
def main(): """Main function of script""" args = utils.read_arguments(__doc__) documents = utils.pickle_from_file(args['input_filename']) transformer = conll_feature_extractor.ConllFeatureExtractor( use_structural=True, use_syntactic=True, # use_lexical=True ) # Extract instances and labels. Each instance is a sentence, represented as # a list of feature dictionaries for each work. Labels are represented as # a list of word labels. instances = transformer.get_feature_dict(documents) labels = conll_feature_extractor.get_labels_from_documents(documents) x_train, x_test, y_train, y_test = train_test_split(instances, labels, test_size=0.33) classifier = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True) if not args['search_grid']: classifier.fit(x_train, y_train) predictions = list(itertools.chain(*classifier.predict(x_test))) evaluation.log_report(predictions, list(itertools.chain(*y_test))) else: # label_names = list(classifier.classes_) # label_names.remove('O') params_space = { 'c1': scipy.stats.expon(scale=0.5), 'c2': scipy.stats.expon(scale=0.05), } f1_scorer = metrics.make_scorer( suite_metrics.flat_f1_score, average='weighted') #, labels=label_names) # search rs = RandomizedSearchCV(classifier, params_space, cv=3, verbose=1, n_jobs=-1, n_iter=50, scoring=f1_scorer) rs.fit(x_train, y_train) print('best params:', rs.best_params_) print('best CV score:', rs.best_score_) classifier = rs.best_estimator_ predictions = list(itertools.chain(*classifier.predict(x_test))) evaluation.log_report(predictions, list(itertools.chain(*y_test)))
def main(): """Main function of script.""" args = utils.read_arguments(__doc__) # Read dataset x_matrix, y_vector = utils.pickle_from_file(args['input_filename']) stemmer = WordNetLemmatizer() with open(args['output_filename'], 'w') as output_file: for tree, label in zip(x_matrix, y_vector): output_file.write(instance_line(tree[0], label, stemmer) + '\n')
def test_syntactic_features(self):
    """Test extraction of syntactic features."""
    extractor = ConllFeatureExtractor(use_structural=False,
                                      use_syntactic=True)
    self.document.parse_trees = utils.pickle_from_file(
        os.path.join('test_files', 'parse_trees.pickle'))
    features = extractor.get_syntactic_features(self.document)[3][0]
    self.assertEqual('IN', features['syn:pos'])
    self.assertEqual(2, features['syn:lca:next'])
    self.assertEqual('PP', features['syn:lca:next_tag'])
    self.assertEqual(10, features['syn:lca:prev'])  # Height of the tree + 1
    self.assertEqual('', features['syn:lca:prev_tag'])
def get_graph(graph_filename, category_filename):
    if graph_filename and os.path.isfile(graph_filename):
        print('Reading pickled graph')
        hierarchy_graph = utils.pickle_from_file(graph_filename)
    else:
        hierarchy_graph = networkx.DiGraph()
        categories = utils.get_categories_from_file(category_filename)
        print('Downloading categories')
        for category_name in tqdm(categories):
            utils.add_subcategories(category_name, hierarchy_graph)
        if graph_filename:
            print('Saving graph')
            utils.pickle_to_file(hierarchy_graph, graph_filename)
    return hierarchy_graph
def main(): """Main function of script""" args = utils.read_arguments(__doc__) print('Loading documents') documents = utils.pickle_from_file(args['input_filename']) transformer = conll_feature_extractor.ConllFeatureExtractor( use_structural=True, use_syntactic=True, use_lexical=True) # Extract instances and labels. Each instance is a sentence, represented as # a list of feature dictionaries for each work. instances = transformer.get_feature_dict(documents) utils.pickle_to_file(instances, args['output_filename']) print('All operations completed')
def main(): """Main function of script""" args = utils.read_arguments(__doc__) documents = utils.pickle_from_file(args['input_filename']) if args['separation'] in ['sentence', 'paragraph', 'section']: separation = args['separation'] else: separation = 'sentence' with open(args['output_filename'], 'w') as output_file: writer = DocumentWriter(output_file, include_relations=args['include_relations'], separation=separation) for document in tqdm(documents): if document.has_annotation(): print('Adding document {}'.format(document.identifier)) writer.write_document(document)
def test_get_parent_siblings(self):
    """Test the function get_parent_sibling."""
    tree = utils.pickle_from_file(
        os.path.join('test_files', 'parse_trees.pickle'))[0]
    expected_pairs = ((u',[,/,]', u'NP[importance/NN]'),
                      (u'DT[the/DT]', u'NN[importance/NN]'),
                      (u'IN[of/IN]', u'NP[vehicles/NNS]'),
                      (u'VBZ[has/VBZ]', u'VP[been/VBN]'),
                      (u'VBN[been/VBN]', u'ADVP[widely/RB]'),
                      (u'VBN[recognized/VBN]', u'NP[thoughout/NN]'),
                      (u'DT[the/DT]', u'NN[world/NN]'))
    result = []
    for leaf_index in range(len(tree.leaves())):
        pair = get_parent_sibling(tree, leaf_index)
        if pair:
            result.append(pair)
    for expected_pair, resulting_pair in zip(expected_pairs, result):
        self.assertEqual(expected_pair, resulting_pair)
def main(): """Script main function""" args = utils.read_arguments(__doc__) # Read dataset x_matrix, y_vector = utils.pickle_from_file(args['input_filename']) classifier = process_pipeline.get_basic_tree_pipeline( ('clf', LogisticRegression(C=0.1, n_jobs=-1))) classifier.set_params(**get_optimized_params()) if args['search_grid']: parameters = process_pipeline.get_tree_parameter_grid() parameters['clf__C'] = (1, 0.3, 0.1, 0.05, 0.01) evaluation.evaluate_grid_search(x_matrix, y_vector, classifier, parameters, log_file='logs/log-grid-lr') else: evaluation.deep_evaluate(x_matrix, y_vector, classifier)
def main(): """Script main function""" args = utils.read_arguments(__doc__) # Read dataset x_matrix, y_vector = utils.pickle_from_file(args['input_filename']) classifier = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 3), max_features=10**4)), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB(alpha=2)), ]) evaluation.evaluate(x_matrix, y_vector, classifier) parameters = { 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vect__max_features': [10**3, 10**4, 10**5], 'tfidf__use_idf': (True, False), 'clf__alpha': (1, 2, 5, 10), } evaluation.evaluate_grid_search(x_matrix, y_vector, classifier, parameters)
def main(source_filename, output_filename):
    entities = set(
        [uri[2:] for uri in utils.pickle_from_file(source_filename)])

    # Read previously downloaded entities so they are not requested again.
    seen_uris = set()
    if os.path.exists(output_filename):
        with jsonlines.open(output_filename) as reader:
            for obj in reader:
                seen_uris.add(obj.get('uri'))
    entities = entities.difference(seen_uris)

    # Write missing entities. Append so previously downloaded entities are kept.
    with jsonlines.open(output_filename, mode='a') as writer:
        for uri in tqdm(entities):
            try:
                labels = get_labels(uri)
            except (urllib.error.HTTPError, EndPointNotFound) as error:
                logging.error('Error for uri {}'.format(uri))
                logging.error(error)
                continue  # Skip entities whose labels could not be fetched.
            writer.write({'uri': uri, 'labels': labels})
help="Measure of predictiveness to use", default='auc') parser.add_argument("--output-dir", type=str, help="Where to save off shapley values", default='../results/') parser.add_argument("--estimator-type", type=str, help="estimator to fit", default="nn") args = parser.parse_args() print("Running " + args.estimator_type + " for VIM measure " + args.measure) ## -------------------------------------------------- ## load the data, set up ## -------------------------------------------------- data = uts.pickle_from_file(args.dataset) p = data.x_train.shape[1] np.random.seed(args.seed) folds_outer = np.random.choice(a=np.arange(2), size=data.y_train.shape[0], replace=True, p=np.array([0.25, 0.75])) data_0 = dg.Dataset(x_train=data.x_train[folds_outer == 0, :], y_train=data.y_train[folds_outer == 0], x_test=None, y_test=None) data_1 = dg.Dataset(x_train=data.x_train[folds_outer == 1, :], y_train=data.y_train[folds_outer == 1], x_test=None, y_test=None) cc_all = (np.sum(np.isnan(data.x_train), axis=1) == 0)
def load_dataset(filename):
    pickled_object = utils.pickle_from_file(filename)
    return (pickled_object['embeddings'], pickled_object['mappings'],
            pickled_object['data'], pickled_object['datasets'])
#!/usr/local/bin/python3
## turn the ICU dataset into CSVs for use in R

# required functions and libraries
import sys

import numpy as np
import pandas as pd

import utils as uts

# load the dataset
out = uts.pickle_from_file(sys.argv[1] + '/icu_data_processed.pkl')

# turn each split into a DataFrame with 1-based row and column labels
x_train = pd.DataFrame(data=out.x_train,
                       index=np.arange(1, out.x_train.shape[0] + 1),
                       columns=np.arange(1, out.x_train.shape[1] + 1))
y_train = pd.DataFrame(data=out.y_train,
                       index=np.arange(1, out.y_train.shape[0] + 1),
                       columns=np.arange(1, out.y_train.shape[1] + 1))
x_test = pd.DataFrame(data=out.x_test,
                      index=np.arange(1, out.x_test.shape[0] + 1),
                      columns=np.arange(1, out.x_test.shape[1] + 1))
y_test = pd.DataFrame(data=out.y_test,
                      index=np.arange(1, out.y_test.shape[0] + 1),
                      columns=np.arange(1, out.y_test.shape[1] + 1))

x_train.to_csv(sys.argv[1] + '/icu_data_processed_xtrain.csv')
y_train.to_csv(sys.argv[1] + '/icu_data_processed_ytrain.csv')
x_test.to_csv(sys.argv[1] + '/icu_data_processed_xtest.csv')
y_test.to_csv(sys.argv[1] + '/icu_data_processed_ytest.csv')
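# All of the snippets above rely on `utils.pickle_from_file` and
# `utils.pickle_to_file` (aliased `uts` in the last two scripts). Their
# implementation is not shown here; the following is a minimal sketch,
# assuming they are thin wrappers around the standard-library `pickle`
# module with the (filename) / (object, filename) signatures seen in the
# calls above. Names and details are inferred from usage, not confirmed.
import pickle


def pickle_from_file(filename):
    """Load and return the object pickled in `filename` (assumed helper)."""
    with open(filename, 'rb') as file_:
        return pickle.load(file_)


def pickle_to_file(obj, filename):
    """Pickle `obj` into `filename` (assumed helper)."""
    with open(filename, 'wb') as file_:
        pickle.dump(obj, file_)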