Example #1
    def load_from_files(self, directory_name, instances_filename=None,
                        labels_filename=None, name=None):
        """Load sample from the directory directory_name.

        Args:
            directory_name (string): Name of the directory to read files from.
            instances_filename (string, optional): Name of the file containing
                the instances matrix in numpy compressed format.
            labels_filename (string, optional): Name of the file containing the
                labels in pickled format.
            name (string, optional): Additional name used when building the
                dataset filenames.
        """
        super(BaseSampledDataset, self).load_from_files(directory_name)
        if instances_filename:
            self._instances = numpy.load(instances_filename)
        if self._instances is None:
            logging.error('Error loading instances from file {}'.format(
                instances_filename
            ))
        if labels_filename:
            self._labels = utils.pickle_from_file(labels_filename)

        self._sample_indices = utils.pickle_from_file(
            self._get_objective_filename(directory_name,
                                         'sample_indices', name))
        self.samples_num = len(self._sample_indices)
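
A usage sketch for the loader above. The class name comes from the super() call; the concrete subclass, its constructor, and the file paths below are assumptions for illustration only.

# Hypothetical concrete subclass of BaseSampledDataset and hypothetical paths.
dataset = SampledDataset()
dataset.load_from_files('data/processed',
                        instances_filename='data/processed/instances.npz',
                        labels_filename='data/processed/labels.pickle',
                        name='train')
print(dataset.samples_num, 'samples loaded')
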
Example #2
    def load_from_files(self, directory_name, instances_filename=None,
                        labels_filename=None, name=None):
        """Builds dataset from files saved in the directory directory_name.

        Args:
            directory_name (string): Name of the directory to read files from.
            instances_filename (string, optional): Name of the file containing
                the instances matrix in numpy compressed format.
            labels_filename (string, optional): Name of the file containing the
                labels in pickled format.
            name (string, optional): Additional name used when building the
                dataset filenames.
        """
        instances = None
        if instances_filename:
            instances = numpy.load(instances_filename)
        if instances is None:
            logging.error('Error loading instances from file {}'.format(
                instances_filename
            ))
        labels = None
        if labels_filename:
            labels = utils.pickle_from_file(labels_filename)
        indices = utils.pickle_from_file(self._get_objective_filename(
            directory_name, 'indices', name))
        self.create_from_matrixes(instances, indices, labels)
Example #3
def main():
    args = docopt(__doc__, version=1.0)
    mapping = utils.pickle_from_file(args['<mapping_filename>'])
    graph = utils.pickle_from_file(args['<graph_filename>'])

    yago_to_lkif = invert_mapping(mapping)

    # Nodes without a direct mapping inherit the mappings of their oldest
    # ancestors in the hierarchy.
    for node in graph.nodes():
        if len(yago_to_lkif[node]) != 0:
            continue
        for ancestor in get_oldest_ancestors(node, graph):
            yago_to_lkif[node].update(yago_to_lkif[ancestor])

    utils.pickle_to_file(dict(yago_to_lkif), args['<output_file>'])
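
invert_mapping and get_oldest_ancestors are project helpers that are not shown here. A minimal sketch of what the inversion could look like, assuming the pickled mapping associates each LKIF class with a collection of YAGO nodes (the direction and the names are assumptions):

from collections import defaultdict

def invert_mapping(mapping):
    # Assumed input: {lkif_class: iterable of yago_nodes}. The result is a
    # defaultdict(set) keyed by YAGO node, which matches how yago_to_lkif is
    # indexed, updated and finally cast to dict() above.
    inverted = defaultdict(set)
    for lkif_class, yago_nodes in mapping.items():
        for node in yago_nodes:
            inverted[node].add(lkif_class)
    return inverted
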
Example #4
def main():
    """Main function of script."""
    args = utils.read_arguments(__doc__)

    # Read dataset. Each row of x_matrix is a sentence.
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    # Get Stanford model
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8')
    # Get parse trees.
    parsed_matrix = []
    for index, document in tqdm(enumerate(x_matrix), total=len(x_matrix)):
        parsed_document = []
        for paragraph_index, paragraph in enumerate(document):
            parsed_paragraph = []
            for sentence_index, sentence in enumerate(paragraph):
                try:
                    parsed_paragraph.append(
                        list(
                            parser.raw_parse(
                                six.text_type(sentence.decode('utf-8')))))
                except UnicodeDecodeError:
                    logging.warning(
                        'Skipping sentence {}-{}-{} due to unicode error'.format(
                            index, paragraph_index, sentence_index))
                    # Drop the label so labels stay aligned with the sentences
                    # that were actually parsed.
                    y_vector[index].pop(sentence_index)
            parsed_document.append(parsed_paragraph)
        parsed_matrix.append(parsed_document)

    # Save output
    logging.info('Saving {} documents'.format(len(parsed_matrix)))
    utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename'])
    logging.info('All operations finished')
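
For context, NLTK's StanfordParser.raw_parse returns an iterator of parse trees, which is why each result is wrapped in list() above. A small sketch, assuming the Stanford parser jars are already configured for NLTK:

# raw_parse yields nltk.tree.Tree objects, usually one tree per sentence.
trees = list(parser.raw_parse(u'The importance of vehicles has been recognized.'))
print(trees[0])           # bracketed parse tree
print(trees[0].height())  # depth of the tree
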
Example #5
 def test_matrix(self):
     """Test the size of the sparse matrix generated."""
     extractor = ConllFeatureExtractor(use_structural=True)
     self.document.parse_trees = utils.pickle_from_file(
         os.path.join('test_files', 'parse_trees.pickle'))
     matrix = extractor.transform([self.document])
     self.assertIsInstance(matrix, csr_matrix)
     self.assertEqual(
         sum([len(sentence.words) for sentence in self.document.sentences]),
         matrix.shape[0])
Example #6
 def test_lexical_features(self):
     """Test extraction of syntactic features."""
     extractor = ConllFeatureExtractor(use_structural=False,
                                       use_lexical=True)
     self.document.parse_trees = utils.pickle_from_file(
         os.path.join('test_files', 'parse_trees.pickle'))
     features = extractor.get_lexical_features(self.document)[3][0]
     self.assertEqual('PP[From/IN]', features['ls:token_comb'])
     self.assertEqual('IN[From/IN]-NP[perspective/NN]',
                      features['ls:right_comb'])
Example #7
def main():
    """Main function of script"""
    args = utils.read_arguments(__doc__)
    documents = utils.pickle_from_file(args['input_filename'])

    transformer = conll_feature_extractor.ConllFeatureExtractor(
        use_structural=True,
        use_syntactic=True,  # use_lexical=True
    )
    # Extract instances and labels. Each instance is a sentence, represented as
    # a list of feature dictionaries, one per word. Labels are represented as
    # a list of word labels.
    instances = transformer.get_feature_dict(documents)
    labels = conll_feature_extractor.get_labels_from_documents(documents)

    x_train, x_test, y_train, y_test = train_test_split(instances,
                                                        labels,
                                                        test_size=0.33)

    classifier = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                      c1=0.1,
                                      c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)

    if not args['search_grid']:
        classifier.fit(x_train, y_train)
        predictions = list(itertools.chain(*classifier.predict(x_test)))

        evaluation.log_report(predictions, list(itertools.chain(*y_test)))
    else:
        # label_names = list(classifier.classes_)
        # label_names.remove('O')
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        f1_scorer = metrics.make_scorer(
            suite_metrics.flat_f1_score,
            average='weighted')  #, labels=label_names)
        # search
        rs = RandomizedSearchCV(classifier,
                                params_space,
                                cv=3,
                                verbose=1,
                                n_jobs=-1,
                                n_iter=50,
                                scoring=f1_scorer)
        rs.fit(x_train, y_train)
        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)
        classifier = rs.best_estimator_
        predictions = list(itertools.chain(*classifier.predict(x_test)))
        evaluation.log_report(predictions, list(itertools.chain(*y_test)))
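
sklearn_crfsuite.CRF expects each instance to be a sequence of per-token feature dicts and each label sequence to be the matching list of tag strings. A small illustration of the expected shapes; the feature keys and tag names below are invented for the example:

# One instance = one sentence: a list with one feature dict per word.
instance = [
    {'word.lower': 'from', 'pos': 'IN'},
    {'word.lower': 'this', 'pos': 'DT'},
    {'word.lower': 'perspective', 'pos': 'NN'},
]
# The matching labels: one tag per word (the tag set is hypothetical).
instance_labels = ['O', 'O', 'B-claim']
# x_train is a list of such instances and y_train a list of such label lists,
# which is what classifier.fit(x_train, y_train) consumes above.
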
Example #8
def main():
    """Main function of script."""
    args = utils.read_arguments(__doc__)

    # Read dataset
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])
    stemmer = WordNetLemmatizer()

    with open(args['output_filename'], 'w') as output_file:
        for tree, label in zip(x_matrix, y_vector):
            output_file.write(instance_line(tree[0], label, stemmer) + '\n')
Example #9
 def test_syntactic_features(self):
     """Test extraction of syntactic features."""
     extractor = ConllFeatureExtractor(use_structural=False,
                                       use_syntactic=True)
     self.document.parse_trees = utils.pickle_from_file(
         os.path.join('test_files', 'parse_trees.pickle'))
     features = extractor.get_syntactic_features(self.document)[3][0]
     self.assertEqual('IN', features['syn:pos'])
     self.assertEqual(2, features['syn:lca:next'])
     self.assertEqual('PP', features['syn:lca:next_tag'])
     self.assertEqual(10,
                      features['syn:lca:prev'])  # Height of the tree + 1
     self.assertEqual('', features['syn:lca:prev_tag'])
Example #10
def get_graph(graph_filename, category_filename):
    if graph_filename and os.path.isfile(graph_filename):
        print('Reading pickled graph')
        hierarchy_graph = utils.pickle_from_file(graph_filename)
    else:
        hierarchy_graph = networkx.DiGraph()
        categories = utils.get_categories_from_file(category_filename)
        print('Downloading categories')
        for category_name in tqdm(categories):
            utils.add_subcategories(category_name, hierarchy_graph)
        if graph_filename:
            print('Saving graph')
            utils.pickle_to_file(hierarchy_graph, graph_filename)
    return hierarchy_graph
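
A usage sketch for get_graph; the filenames are hypothetical. The pickle acts as a cache: the first call downloads the category hierarchy, later calls simply reload it.

hierarchy = get_graph('category_hierarchy.pickle', 'categories.txt')
print(hierarchy.number_of_nodes(), 'categories in the hierarchy')
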
Example #11
def main():
    """Main function of script"""
    args = utils.read_arguments(__doc__)
    print('Loading documents')
    documents = utils.pickle_from_file(args['input_filename'])

    transformer = conll_feature_extractor.ConllFeatureExtractor(
        use_structural=True, use_syntactic=True, use_lexical=True)
    # Extract instances. Each instance is a sentence, represented as a list of
    # feature dictionaries, one per word.
    instances = transformer.get_feature_dict(documents)

    utils.pickle_to_file(instances, args['output_filename'])

    print('All operations completed')
Example #12
def main():
    """Main function of script"""
    args = utils.read_arguments(__doc__)
    documents = utils.pickle_from_file(args['input_filename'])
    if args['separation'] in ['sentence', 'paragraph', 'section']:
        separation = args['separation']
    else:
        separation = 'sentence'
    with open(args['output_filename'], 'w') as output_file:
        writer = DocumentWriter(output_file,
                                include_relations=args['include_relations'],
                                separation=separation)
        for document in tqdm(documents):
            if document.has_annotation():
                print('Adding document {}'.format(document.identifier))
                writer.write_document(document)
Example #13
 def test_get_parent_siblings(self):
     """Test the function get_parent_sibling."""
     tree = utils.pickle_from_file(
         os.path.join('test_files', 'parse_trees.pickle'))[0]
     expected_pairs = ((u',[,/,]', u'NP[importance/NN]'),
                       (u'DT[the/DT]', u'NN[importance/NN]'),
                       (u'IN[of/IN]', u'NP[vehicles/NNS]'),
                       (u'VBZ[has/VBZ]', u'VP[been/VBN]'),
                       (u'VBN[been/VBN]', u'ADVP[widely/RB]'),
                       (u'VBN[recognized/VBN]', u'NP[thoughout/NN]'),
                       (u'DT[the/DT]', u'NN[world/NN]'))
     result = []
     for leaf_index in range(len(tree.leaves())):
         pair = get_parent_sibling(tree, leaf_index)
         if pair:
             result.append(pair)
     for expected_pair, resulting_pair in zip(expected_pairs, result):
         self.assertEqual(expected_pair, resulting_pair)
Example #14
def main():
    """Script main function"""
    args = utils.read_arguments(__doc__)

    # Read dataset
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    classifier = process_pipeline.get_basic_tree_pipeline(
        ('clf', LogisticRegression(C=0.1, n_jobs=-1)))
    classifier.set_params(**get_optimized_params())

    if args['search_grid']:
        parameters = process_pipeline.get_tree_parameter_grid()
        parameters['clf__C'] = (1, 0.3, 0.1, 0.05, 0.01)
        evaluation.evaluate_grid_search(x_matrix,
                                        y_vector,
                                        classifier,
                                        parameters,
                                        log_file='logs/log-grid-lr')
    else:
        evaluation.deep_evaluate(x_matrix, y_vector, classifier)
Example #15
def main():
    """Script main function"""
    args = utils.read_arguments(__doc__)

    # Read dataset
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    classifier = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 3), max_features=10**4)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=2)),
    ])
    evaluation.evaluate(x_matrix, y_vector, classifier)

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__max_features': [10**3, 10**4, 10**5],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1, 2, 5, 10),
    }
    evaluation.evaluate_grid_search(x_matrix, y_vector, classifier, parameters)
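
evaluation.evaluate and evaluation.evaluate_grid_search are project helpers whose internals are not shown. The 'step__param' keys follow scikit-learn's Pipeline convention, so the same grid could also be run directly with GridSearchCV; a minimal sketch, with illustrative cv and scoring values:

from sklearn.model_selection import GridSearchCV

# Grid keys address pipeline steps by name: vect__*, tfidf__*, clf__*.
search = GridSearchCV(classifier, parameters, cv=5, scoring='f1_macro', n_jobs=-1)
search.fit(x_matrix, y_vector)
print('best params:', search.best_params_)
print('best CV score:', search.best_score_)
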
Example #16
def main(source_filename, output_filename):

    entities = set(
        [uri[2:] for uri in utils.pickle_from_file(source_filename)])

    # Read previous downloaded
    seen_uris = set()
    if os.path.exists(output_filename):
        with jsonlines.open(output_filename) as reader:
            for obj in reader:
                seen_uris.add(obj.get('uri'))

    entities = entities.difference(seen_uris)
    # Write missing entities
    with jsonlines.open(output_filename, mode='w') as writer:
        for uri in tqdm(entities):
            try:
                labels = get_labels(uri)
            except (urllib.error.HTTPError, EndPointNotFound) as error:
                logging.error('Error for uri {}'.format(uri))
                logging.error(error)
                # Skip this entity; its labels could not be retrieved.
                continue
            writer.write({'uri': uri, 'labels': labels})
                    help="Measure of predictiveness to use",
                    default='auc')
parser.add_argument("--output-dir",
                    type=str,
                    help="Where to save off shapley values",
                    default='../results/')
parser.add_argument("--estimator-type",
                    type=str,
                    help="estimator to fit",
                    default="nn")
args = parser.parse_args()
print("Running " + args.estimator_type + " for VIM measure " + args.measure)
## --------------------------------------------------
## load the data, set up
## --------------------------------------------------
data = uts.pickle_from_file(args.dataset)
p = data.x_train.shape[1]
np.random.seed(args.seed)
folds_outer = np.random.choice(a=np.arange(2),
                               size=data.y_train.shape[0],
                               replace=True,
                               p=np.array([0.25, 0.75]))
data_0 = dg.Dataset(x_train=data.x_train[folds_outer == 0, :],
                    y_train=data.y_train[folds_outer == 0],
                    x_test=None,
                    y_test=None)
data_1 = dg.Dataset(x_train=data.x_train[folds_outer == 1, :],
                    y_train=data.y_train[folds_outer == 1],
                    x_test=None,
                    y_test=None)
# Indicator of complete cases: rows of x_train with no missing values.
cc_all = (np.sum(np.isnan(data.x_train), axis=1) == 0)
Example #18

def load_dataset(filename):
    pickled_object = utils.pickle_from_file(filename)
    return (pickled_object['embeddings'], pickled_object['mappings'],
            pickled_object['data'], pickled_object['datasets'])
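
A usage sketch; the filename is hypothetical and the unpacking order follows the return tuple above.

embeddings, mappings, data, datasets = load_dataset('dataset.pickle')
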
Example #19
#!/usr/local/bin/python3

## Turn the ICU dataset into a CSV for use in R

# required functions and libraries
import utils as uts
import numpy as np
import sys
import pandas as pd

# load the dataset
out = uts.pickle_from_file(sys.argv[1] + '/icu_data_processed.pkl')

# turn it into a csv
x_train = pd.DataFrame(data=out.x_train,
                       index=np.arange(1, out.x_train.shape[0] + 1),
                       columns=np.arange(1, out.x_train.shape[1] + 1))
y_train = pd.DataFrame(data=out.y_train,
                       index=np.arange(1, out.y_train.shape[0] + 1),
                       columns=np.arange(1, out.y_train.shape[1] + 1))
x_test = pd.DataFrame(data=out.x_test,
                      index=np.arange(1, out.x_test.shape[0] + 1),
                      columns=np.arange(1, out.x_test.shape[1] + 1))
y_test = pd.DataFrame(data=out.y_test,
                      index=np.arange(1, out.y_test.shape[0] + 1),
                      columns=np.arange(1, out.y_test.shape[1] + 1))
x_train.to_csv(sys.argv[1] + '/icu_data_processed_xtrain.csv')
y_train.to_csv(sys.argv[1] + '/icu_data_processed_ytrain.csv')
x_test.to_csv(sys.argv[1] + '/icu_data_processed_xtest.csv')
y_test.to_csv(sys.argv[1] + '/icu_data_processed_ytest.csv')