def get_datasets_political_parties(path='data/political-data/'):
    """
    Loads Political party data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    arr = os.listdir(path)
    print(arr)
    datasets = dict()
    class_value = 0
    datasets['data'] = []
    datasets['target'] = []
    datasets['target_names'] = []

    for input_file in arr:
        read_file = path + input_file
        with open(read_file, "r") as f:
            data = [s.strip() for s in f.readlines()]
        target = [class_value for x in data]
        datasets['data'].append(data)
        datasets['target'].append(target)
        datasets['target_names'].append(input_file)
        class_value = class_value + 1

    datasets['data'] = utils.flatten_list(datasets['data'])
    datasets['target'] = utils.flatten_list(datasets['target'])
    datasets['target_names'] = datasets['target_names']
    print('The Target Names: ', datasets['target_names'])
    return datasets
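Every snippet in this collection relies on a flatten_list helper, usually imported from a project-specific utils module whose source is not shown here. A minimal sketch of what such a helper typically looks like, assuming a single level of nesting, together with a recursive variant for the projects that appear to pass arbitrarily nested (or already flat) lists:

def flatten_list(nested):
    """Hedged sketch: flatten one level of nesting, e.g. [[1, 2], [3]] -> [1, 2, 3]."""
    return [item for sublist in nested for item in sublist]

def flatten_list_recursive(nested):
    """Variant that flattens arbitrary nesting and leaves non-list items untouched."""
    flat = []
    for item in nested:
        if isinstance(item, list):
            flat.extend(flatten_list_recursive(item))
        else:
            flat.append(item)
    return flat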
Example No. 2
 def create_d2d_sparse_matrix(i2d, drug_to_interactions):
     d2i = array_to_dict(i2d)
     number_of_drugs = len(d2i)
     print('creating matrix')
     rows = flatten_list([[d2i[x[0]]] * len(x[1])
                          for x in sorted(drug_to_interactions.items())])
     cols = [
         d2i[t] for t in flatten_list(
             [x[1] for x in sorted(drug_to_interactions.items())])
     ]
     print('number of valid interactions:', len(cols))
     assert len(rows) == len(cols)
     data = [1] * len(cols)
     m = csr_matrix((data, (rows, cols)),
                    shape=(number_of_drugs, number_of_drugs),
                    dtype='f')
     print('m shape:', m.shape, 'm non zeros:', m.nnz)
     m = m.todense()
     count_non_sym = 0
     for i in range(m.shape[0]):
         for j in range(i + 1, m.shape[0]):
             if m[i, j] != m[j, i]:
                 count_non_sym += 1
             m[i, j] = max(m[i, j], m[j, i])
             m[j, i] = m[i, j]
     print('non sym count (matrix was made sym using max):', count_non_sym)
     assert np.allclose(m, m.T, atol=1e-8)  #matrix is symmetric
     return m
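The symmetrization loop above visits every entry pair of the dense matrix in pure Python; a hedged sketch of an equivalent vectorized version (same dense input, assuming it fits in memory) could look like this:

import numpy as np

def symmetrize_max(m_dense):
    """Return a symmetric copy of a dense matrix, taking the element-wise max with its transpose."""
    m_dense = np.asarray(m_dense)
    # number of (i, j) pairs with i < j whose two entries disagree
    count_non_sym = int(np.count_nonzero(np.triu(m_dense != m_dense.T, k=1)))
    sym = np.maximum(m_dense, m_dense.T)
    assert np.allclose(sym, sym.T, atol=1e-8)
    return sym, count_non_sym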
Example No. 3
def get_tags_list(df_path):
    """
    Get the list of BIO tags.

    Args:
      df_path: path to the directory containing the data files
    """

    train_df = pd.read_csv(df_path + 'train_df_opinion.tsv', delimiter='\t')
    dev_df = pd.read_csv(df_path + 'dev_df_opinion.tsv', delimiter='\t')
    test_syn_df = pd.read_csv(df_path + "test_syn_df_opinion.tsv",
                              delimiter='\t')
    test_dia_df = pd.read_csv(df_path + "test_dia_df_opinion.tsv",
                              delimiter='\t')

    # concatenate data frames
    full_df = pd.concat([train_df, dev_df, test_syn_df, test_dia_df])

    # prepare labels
    _, entities = prep_df(full_df)
    full_df = bio_tagging_df(full_df)
    labels = full_df.bio_tags.values
    labels_unlist = [list(chain.from_iterable(lab)) for lab in labels]
    labels_flat = [flatten_list(lab) for lab in labels_unlist]

    # create tags
    tag_values = [list(set(tag)) for tag in labels_flat]
    tag_values = list(set(flatten_list(tag_values)))
    tag_values.append('PAD')
    tag2idx = {t: i for i, t in enumerate(tag_values)}

    return tag_values, tag2idx, entities
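A tiny hedged illustration of the tag-vocabulary step at the end (the tag names are hypothetical; the real ones come from bio_tagging_df):

labels_flat = [['B-Opinion', 'I-Opinion', 'O'], ['O', 'B-Opinion']]
tag_values = list(set(flatten_list([list(set(tag)) for tag in labels_flat])))
tag_values.append('PAD')
tag2idx = {t: i for i, t in enumerate(tag_values)}
# e.g. {'I-Opinion': 0, 'O': 1, 'B-Opinion': 2, 'PAD': 3}  (set ordering may vary)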
Example No. 4
def get_sentences_biotags(tokenizer, sentences, labels, max_len):
    '''
    Get tokenized, flattened sentences and BIO tags.

    Args:
      tokenizer: tokenizer used to split words into subword tokens
      sentences: text column from data
      labels: label column from data
      max_len: maximal sequence length
    '''

    sentences_unlist = [list(chain.from_iterable(sent)) for sent in sentences]
    labels_unlist = [list(chain.from_iterable(lab)) for lab in labels]
    sentences_flat = [flatten_list(sent) for sent in sentences_unlist]
    labels_flat = [flatten_list(lab) for lab in labels_unlist]

    tokenized_texts_and_labels = [
        tokenize_and_preserve_labels(tokenizer, sent, labs, max_len)
        for sent, labs in zip(sentences_flat, labels_flat)
    ]
    tokenized_texts = [
        token_label_pair[0] for token_label_pair in tokenized_texts_and_labels
    ]
    tokenized_labels = [
        token_label_pair[1] for token_label_pair in tokenized_texts_and_labels
    ]

    return tokenized_texts, tokenized_labels
Example No. 5
 def validation_epoch_end(self, outputs, prefix="val") -> Dict:
     self.step_count += 1
     losses = {
         k: torch.stack([x[k] for x in outputs]).mean()
         for k in self.loss_names
     }
     loss = losses["loss"]
     rouges = {
         k: np.array([x[k] for x in outputs]).mean()
         for k in ROUGE_KEYS + ["gen_time", "summ_len"]
     }
     rouge_tensor: torch.FloatTensor = torch.tensor(
         rouges["rouge2"]).type_as(loss)
     rouges.update({k: v.item() for k, v in losses.items()})
     losses.update(rouges)
     metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()}
     metrics["step_count"] = self.step_count
     self.save_metrics(metrics, prefix)  # writes to self.metrics_save_path
     preds = flatten_list([x["preds"] for x in outputs])
     target = flatten_list([x["target"] for x in outputs])
     return {
         "log": metrics,
         "preds": preds,
         f"{prefix}_loss": loss,
         f"{prefix}_rouge": rouge_tensor,
         "target": target,
     }
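For intuition, a hedged miniature of the preds/target aggregation above, with hypothetical batch outputs:

outputs = [{"preds": ["summary a", "summary b"], "target": ["ref a", "ref b"]},
           {"preds": ["summary c"], "target": ["ref c"]}]
preds = flatten_list([x["preds"] for x in outputs])    # ['summary a', 'summary b', 'summary c']
target = flatten_list([x["target"] for x in outputs])  # ['ref a', 'ref b', 'ref c']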
def leave_one_group_out_cv_single_time_point(X,
                                             y,
                                             group_names,
                                             train_predict_fn,
                                             use_features=None):
    if not isinstance(X, pd.DataFrame) or not isinstance(y, pd.DataFrame):
        raise KeyError(
            "leave_one_group_out_cv expects X and y to be data frames.")
    groups = np.unique(group_names)
    target_collection = []
    predicted_probs_collection = []
    predicted_class_collection = []
    trial_id_collection = []
    group_id_collection = []
    for group in groups:
        # Subset X and y
        train_set = X[X['group'] != group]
        test_set = X[X['group'] == group]
        train_labels = y[y['group'] != group]
        test_labels = y[y['group'] == group]

        # Add trial info to collection
        trial_id_collection.append(list(test_set["Trial"]))

        # Extract sensors
        if use_features is not None:
            train_set = train_set.loc[:, use_features]
            test_set = test_set.loc[:, use_features]

        # Convert to numpy arrays
        X_train = np.asarray(train_set)
        X_test = np.asarray(test_set)
        y_train = np.asarray(train_labels["label"])
        y_test = np.asarray(test_labels["label"])

        # Fit model and predict test set
        predicted_probs, predicted_class = train_predict_fn(X_train=X_train,
                                                            X_test=X_test,
                                                            y_train=y_train)

        # Append to collections
        target_collection.append(y_test)
        predicted_probs_collection.append(predicted_probs)
        predicted_class_collection.append(predicted_class)
        group_id_collection.append([group] * len(predicted_class))
    return pd.DataFrame({
        "Group":
        flatten_list(group_id_collection),
        "Trial":
        flatten_list(trial_id_collection),
        "Target":
        flatten_list(target_collection),
        "Predicted Probability":
        flatten_list(predicted_probs_collection),
        "Predicted Class":
        flatten_list(predicted_class_collection)
    })
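A hedged usage sketch with a hypothetical train_predict_fn built on scikit-learn; the 'group', 'Trial' and 'label' column names are required by the function above, while everything else here (feature names, data values) is illustrative:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def train_predict_fn(X_train, X_test, y_train):
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    # probability of the positive class and the hard prediction, per test trial
    return clf.predict_proba(X_test)[:, 1], clf.predict(X_test)

rng = np.random.default_rng(0)
X = pd.DataFrame({"group": [0, 0, 1, 1, 2, 2],
                  "Trial": range(6),
                  "feat_a": rng.normal(size=6),
                  "feat_b": rng.normal(size=6)})
y = pd.DataFrame({"group": X["group"], "label": [0, 1, 0, 1, 0, 1]})

results = leave_one_group_out_cv_single_time_point(
    X, y, group_names=X["group"].values, train_predict_fn=train_predict_fn,
    use_features=["feat_a", "feat_b"])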
Example No. 7
def get_all_words_in_path(path):
    trees = get_trees(path)
    word_list = flatten_list([get_all_names(t) for t in trees])
    word_names = remove_magic(word_list)

    def split_snake_case_name_to_words(name):
        return [n for n in name.split('_') if n]

    return flatten_list([
        split_snake_case_name_to_words(word_name) for word_name in word_names
    ])
Example No. 8
    def create_subject_arrays(self, double_precision=True):
        '''
            Create arrays with errors per subject and per num_target;
            also create an array with the precision per subject and num_target directly.
        '''

        unique_subjects = np.unique(self.dataset['subject'])
        unique_n_items = np.unique(self.dataset['n_items'])

        self.dataset['errors_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
        self.dataset['errors_all_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
        self.dataset['errors_nontarget_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
        self.dataset['precision_subject_nitems_bays'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))
        self.dataset['precision_subject_nitems_theo'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))
        self.dataset['precision_subject_nitems_theo_nochance'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))
        self.dataset['precision_subject_nitems_bays_notreatment'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))

        self.dataset['response_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
        self.dataset['item_angle_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)

        for n_items_i, n_items in enumerate(unique_n_items):
            for subject_i, subject in enumerate(unique_subjects):
                ids_filtered = ((self.dataset['subject']==subject) & (self.dataset['n_items'] == n_items) & (self.dataset.get('masked', False) == False)).flatten()


                # Get the errors
                self.dataset['errors_subject_nitems'][subject_i, n_items_i] = self.dataset['errors_all'][ids_filtered, 0]
                self.dataset['errors_all_subject_nitems'][subject_i, n_items_i] = self.dataset['errors_all'][ids_filtered]
                self.dataset['errors_nontarget_subject_nitems'][subject_i, n_items_i] = self.dataset['errors_all'][ids_filtered, 1:]

                # Get the responses and correct item angles
                self.dataset['response_subject_nitems'][subject_i, n_items_i] = self.dataset['response'][ids_filtered]
                self.dataset['item_angle_subject_nitems'][subject_i, n_items_i] = self.dataset['item_angle'][ids_filtered]

                # Compute the precision
                self.dataset['precision_subject_nitems_bays'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=True, correct_orientation=True, use_wrong_precision=True)
                self.dataset['precision_subject_nitems_theo'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=False)
                self.dataset['precision_subject_nitems_theo_nochance'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=True, correct_orientation=False, use_wrong_precision=False)
                self.dataset['precision_subject_nitems_bays_notreatment'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=True)

        # if double_precision:
        #     precision_subject_nitems *= 2.
        #     precision_subject_nitems_theo *= 2.
        #     # self.dataset['precision_subject_nitems_theo_nochance'] *= 2.
        #     self.dataset['precision_subject_nitems_bays_notreatment'] *= 2.


        self.dataset['errors_nitems'] = np.array([utils.flatten_list(self.dataset['errors_subject_nitems'][:, n_item_i]) for n_item_i in range(unique_n_items.size)])
        self.dataset['errors_all_nitems'] = np.array([utils.flatten_list(self.dataset['errors_all_subject_nitems'][:, n_item_i]) for n_item_i in range(unique_n_items.size)])
        self.dataset['errors_nontarget_nitems'] = self.dataset['errors_all_nitems'][:, :, 1:]
        self.dataset['precision_nitems_bays'] = np.mean(self.dataset['precision_subject_nitems_bays'], axis=0)
        self.dataset['precision_nitems_theo'] = np.mean(self.dataset['precision_subject_nitems_theo'], axis=0)
        self.dataset['precision_nitems_theo_nochance'] = np.mean(self.dataset['precision_subject_nitems_theo_nochance'], axis=0)
        self.dataset['precision_nitems_bays_notreatment'] = np.mean(self.dataset['precision_subject_nitems_bays_notreatment'], axis=0)
Example No. 9
def get_all_words_in_path(path):
    """Returns list of all words"""
    trees = [t for t in get_trees(path) if t]
    function_names = [
        f for f in flatten_list([get_all_names(t) for t in trees])
        if not (f.startswith('__') and f.endswith('__'))
    ]

    def split_snake_case_name_to_words(name):
        return [n for n in name.split('_') if n]

    return flatten_list([
        split_snake_case_name_to_words(function_name)
        for function_name in function_names
    ])
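A hedged illustration of the snake_case splitting with a couple of hypothetical function names:

function_names = ['get_all_names', 'flatten_list']
words = [part for name in function_names for part in name.split('_') if part]
# -> ['get', 'all', 'names', 'flatten', 'list']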
Example No. 10
    def fix_sentence(self, s_tripleset, template, tag2ent):
        ent2tags = {v: k for k, v in tag2ent.items()}

        # s_tripleset must meet "head && tail are in template && tag2ent"
        bad_triples = set()
        for triple_ix, triple in enumerate(s_tripleset):
            for ent in [triple[0], triple[-1]]:
                if ent in ent2tags:
                    if ent2tags[ent] not in template:
                        bad_triples.add(triple_ix)
                        continue
                else:
                    bad_triples.add(triple_ix)
                    continue
        s_tripleset = [
            triple for triple_ix, triple in enumerate(s_tripleset)
            if triple_ix not in bad_triples
        ]

        # tag2ent are entities only in triple_entities
        triple_entities = set(
            flatten_list([(triple[0], triple[-1]) for triple in s_tripleset]))
        tag2tri_ent = {
            k: v
            for k, v in tag2ent.items() if v in triple_entities
        }

        # templates only have triple_entities
        for tag, ent in tag2ent.items():
            if ent not in triple_entities:
                ent = ent.replace('_', ' ')
                template = template.replace(tag, ent)

        if {word for word in template.split()
            if 'AGENT' in word or 'BRIDGE' in word or 'PATIENT' in word} \
                != set(tag2tri_ent.keys()):
            self.cnt_corefs += 1
        assert set(tag2tri_ent.values()) == triple_entities
        '''
        TODO: 
        Erroneous case:
        train.csv:7123:"Ayam penyet	mainIngredients	Squeezed"" or ""smashed"" fried chicken served with sambal",PATIENT_2 is PATIENT_3 .,"Fried chicken is Squeezed"" or ""smashed"" fried chicken served with sambal .",The chicken is smashed and served hot with sambal .,"Ayam penyet	Fried chicken	Squeezed"" or ""smashed"" fried chicken served with sambal",AGENT_1 PATIENT_2 PATIENT_3,ROOT	mainIngredients	mainIngredients_inv,mainIngredients,"[0, 2]","[2, 2, 8]","{""AGENT_1"": ""Ayam penyet"", ""PATIENT_2"": ""Fried chicken"", ""PATIENT_3"": ""Squeezed\"" or \""smashed\"" fried chicken served with sambal""}","[[0, 4], [4, 2], [2, 5], [5, 0]]","Ayam penyet <ENT_SEP> Fried chicken <ENT_SEP> Squeezed"" or ""smashed"" fried chicken served with sambal <ENT_REL_SEP> mainIngredients <REL_TRP_SEP> 0 2 0","Ayam penyet	mainIngredients	Squeezed"" or ""smashed"" fried chicken served with sambal <ENT_TGT_SEP> PATIENT_2 is PATIENT_3 . <TGT_TXT_SEP> The chicken is smashed and served hot with sambal ."
        train.csv:7359:Bakewell tart	ingredient	Frangipane,AGENT_1 contains PATIENT_3 .,Bakewell pudding contains Frangipane .,It contains frangipane .,Bakewell pudding	Bakewell tart	Frangipane,AGENT_1 BRIDGE_2 PATIENT_3,ROOT	ingredient	ingredient_inv,ingredient,"[1, 2]","[2, 2, 1]","{""AGENT_1"": ""Bakewell pudding"", ""BRIDGE_2"": ""Bakewell tart"", ""PATIENT_3"": ""Frangipane""}","[[1, 4], [4, 2], [2, 5], [5, 1]]",Bakewell pudding <ENT_SEP> Bakewell tart <ENT_SEP> Frangipane <ENT_REL_SEP> ingredient <REL_TRP_SEP> 1 2 0,Bakewell tart	ingredient	Frangipane <ENT_TGT_SEP> AGENT_1 contains PATIENT_3 . <TGT_TXT_SEP> It contains frangipane .
        {
            "sent": "demarce short stories in the the grantville gazettes precede eric flint novels .",
            "graph": [
                {
                    "truth": "precededBy",
                    "pred": "precededBy",
                    "ent0_ent1": "1634: the bavarian crisis ENT0_END demarce short stories in the the grantville gazettes"
                },
                {
                    "truth": "<unk>",
                    "pred": "author",
                    "ent0_ent1": "1634: the bavarian crisis ENT0_END eric flint"
                }
            ]
        }
        '''
        return s_tripleset, template, tag2tri_ent
Example No. 11
 def recurse_files(self, folder):
     if isdir(folder):
         return flatten_list([
             self.recurse_files(folder + '/' + f) for f in listdir(folder)
             if not f.startswith('.')
         ])
     return [folder]
Example No. 12
 def validation_epoch_end(self, outputs, prefix="val") -> Dict:
     self.step_count += 1
     losses = {
         k: torch.stack([x[k] for x in outputs]).mean()
         for k in self.loss_names
     }
     loss = losses["loss"]
     generative_metrics = {
         k: np.array([x[k] for x in outputs]).mean()
         for k in self.metric_names + ["gen_time", "gen_len"]
     }
     metric_val = (generative_metrics[self.val_metric] if self.val_metric
                   in generative_metrics else losses[self.val_metric])
     metric_tensor: torch.FloatTensor = torch.tensor(metric_val).type_as(
         loss)
     generative_metrics.update({k: v.item() for k, v in losses.items()})
     losses.update(generative_metrics)
     all_metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()}
     all_metrics["step_count"] = self.step_count
     self.metrics[prefix].append(
         all_metrics)  # callback writes this to self.metrics_save_path
     preds = flatten_list([x["preds"] for x in outputs])
     return {
         "log": all_metrics,
         "preds": preds,
         f"{prefix}_loss": loss,
         f"{prefix}_{self.val_metric}": metric_tensor,
     }
Example No. 13
    def validation_epoch_end(self, outputs, prefix="val") -> Dict:
        self.step_count += 1
        losses = {
            k: torch.stack([x[k] for x in outputs]).mean()
            for k in self.loss_names
        }
        loss = losses["loss"]
        gen_metrics = {
            k: np.array([x[k] for x in outputs]).mean()
            for k in self.metric_names + ["gen_time", "gen_len"]
        }
        metrics_tensor: torch.FloatTensor = torch.tensor(
            gen_metrics[self.val_metric]).type_as(loss)
        gen_metrics.update({k: v.item() for k, v in losses.items()})

        # fix for https://github.com/PyTorchLightning/pytorch-lightning/issues/2424
        if dist.is_initialized():
            dist.all_reduce(metrics_tensor, op=dist.ReduceOp.SUM)
            metrics_tensor = metrics_tensor / dist.get_world_size()
            gen_metrics.update({self.val_metric: metrics_tensor.item()})

        losses.update(gen_metrics)
        metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()}
        metrics["step_count"] = self.step_count
        self.save_metrics(metrics, prefix)  # writes to self.metrics_save_path
        preds = flatten_list([x["preds"] for x in outputs])
        return {
            "log": metrics,
            "preds": preds,
            f"{prefix}_loss": loss,
            f"{prefix}_{self.val_metric}": metrics_tensor
        }
Example No. 14
def cross_validate_time_point(X,
                              y,
                              trial_folds,
                              train_predict_fn,
                              use_features=None):
    folds = np.unique(trial_folds)
    target_collection = []
    predicted_probs_collection = []
    predicted_class_collection = []
    trial_id_collection = []
    fold_collection = []
    for fold in folds:
        train_indices = np.where(trial_folds != fold)[0]
        test_indices = np.where(trial_folds == fold)[0]
        if isinstance(X, pd.DataFrame):
            if use_features is not None:
                X = X.loc[:, use_features]
            X_train = np.asarray(X.iloc[train_indices])
            X_test = np.asarray(X.iloc[test_indices])
        else:
            X_train = X[train_indices]
            X_test = X[test_indices]
        y_train = y[train_indices]
        y_test = y[test_indices]
        # Fit model and predict test set
        predicted_probs, predicted_class = train_predict_fn(X_train=X_train,
                                                            X_test=X_test,
                                                            y_train=y_train)
        # Append to collections
        trial_id_collection.append(test_indices)
        target_collection.append(y_test)
        predicted_probs_collection.append(predicted_probs)
        predicted_class_collection.append(predicted_class)
        fold_collection.append([fold] * len(test_indices))
    return pd.DataFrame({
        "Fold":
        flatten_list(fold_collection),
        "Trial":
        flatten_list(trial_id_collection),
        "Target":
        flatten_list(target_collection),
        "Predicted Probability":
        flatten_list(predicted_probs_collection),
        "Predicted Class":
        flatten_list(predicted_class_collection)
    })
Example No. 15
    def __init__(self, set: DataSetType):
        self.data_set_type = set.value
        files = self.recurse_files(
            path.join(path.dirname(path.realpath(__file__)), "raw", set.value))
        data = flatten_list([RDFFileReader(f).data for f in files])

        super().__init__(data,
                         misspelling=misspelling,
                         rephrase=(rephrase, rephrase_if_must))
Example No. 16
    def get_membership(self, partition_vector=None, flatten=False):

        pvec = partition_vector or self.partition_vector
        result = defaultdict(list)
        for (position, value) in enumerate(pvec):
            result[value].append(position)
        result = [tuple(x) for x in sorted(result.values(), key=len,
                                           reverse=True)]
        return flatten_list(result) if flatten else result
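For intuition, a hedged standalone sketch of the same membership logic outside the class, with a small hypothetical partition vector:

from collections import defaultdict

def membership(pvec, flatten=False):
    result = defaultdict(list)
    for position, value in enumerate(pvec):
        result[value].append(position)
    groups = [tuple(x) for x in sorted(result.values(), key=len, reverse=True)]
    return [p for g in groups for p in g] if flatten else groups

# membership([0, 1, 0, 2, 1])               -> [(0, 2), (1, 4), (3,)]
# membership([0, 1, 0, 2, 1], flatten=True) -> [0, 2, 1, 4, 3]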
Example No. 17
    def simulate_from_result(self,
            partition_object, lsf=False,
            ntimes=1, **kwargs
        ):
        """ Simulates a set of records using parameters estimated when
        calculating concatenated trees from the Partition object """
        inds = partition_object.get_membership()

        if lsf and ntimes > 1:
            multiple_results = [self.simulate(ind, lsf=lsf, ntimes=ntimes)
                                for ind in inds]
            return [flatten_list(result)
                       for result in zip(*multiple_results)]

        else:
            return [flatten_list([self.simulate(ind, **kwargs)
                                  for ind in inds])
                    for _ in range(ntimes)]
Example No. 18
    def pos_tags(self):
        """Return part-of-speech tags, for the entire document.

        >>> Analysis("I am fine. How are you?").pos_tags()
        ... # doctest: +NORMALIZE_WHITESPACE
        [('I', 'PRP'), ('am', 'VBP'), ('fine', 'NN'), ('.', '.'),
        ('How', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('?', '.')]
        """
        return flatten_list(self.pos_tags_by_sentence())
Example No. 19
    def get_membership(self, partition_vector=None, flatten=False):

        pvec = partition_vector or self.partition_vector
        result = defaultdict(list)
        for (position, value) in enumerate(pvec):
            result[value].append(position)
        result = [
            tuple(x) for x in sorted(result.values(), key=len, reverse=True)
        ]
        return flatten_list(result) if flatten else result
Example No. 20
def get_verbs_in_path(path):
    """Returns list of all verbs"""
    trees = [t for t in get_trees(path) if t]
    list_of_nodes = get_nodes(trees)
    fncs = get_function_names(list_of_nodes)
    print('%s functions extracted' % len(fncs))

    verbs = flatten_list([
        get_verbs_from_function_name(function_name) for function_name in fncs
    ])
    return verbs
Example No. 21
def get_all_mushrooms_ids():
    """Gets all mushrooms identifiers"""
    mushrooms_ids = []

    for f in get_families():
        for g in get_genres_per_family(f):
            mushrooms_ids.append(get_mushrooms_per_genre(g))

    mushrooms_ids = utils.flatten_list(mushrooms_ids)

    return mushrooms_ids
def export_compatibility_list():
    with os.scandir('output/compatibility_list/raw') as it:
        for entry in it:
            if entry.name.endswith('.json'):
                data = load_json(entry.path)

                output_data = []

                with open(
                        'output/compatibility_list/' +
                        entry.name.replace('.json', '.txt'), 'w') as f:
                    # join the flattened lines into a single string before writing
                    f.write('\n'.join(flatten_list(output_data)))
Example No. 23
 def find_good_lines(self, line, lines, z=0, separation=0, redge=0.05, ledge=None):
     """
     Find any good lines in the Spectrum.
     
     Parameters
     ----------
     line : :obj:`str`
            Search for good lines of this kind.
     lines : :obj:`str`
            Compare against this kind of lines.
     z : :obj:`float`
            Redshift correction to apply to the rest frequencies.
     separation : :obj:`float`
             Minimum separation between lines to be considered good.
     redge : :obj:`float`
             Minimum distance of the line frequency from the right edge of the
             band, as a fraction of the bandwidth.
     ledge : :obj:`float`, optional
             Same as `redge`, but for the left edge. Defaults to `redge`.
     """
     
     # If no value is given for the left edge, use the same as for the right edge
     if ledge is None:
         ledge = redge
     
     # Find the lines within the Spectrum corresponding to the desired line
     ns, rf = self.find_lines(line, z)
     
     # Find other lines in the Spectrum
     ofs = []
     for l in lines:
         n, f = self.find_lines(l, z)
         ofs.append(list(f))
     
     fofs = utils.flatten_list(ofs)
     
     # Loop over lines checking that their separation from the other lines
     # is larger than separation.
     for i,f in enumerate(rf):
         diff = [abs(of - f) if of != f else separation+1 for of in fofs]
         if all(d > separation for d in diff) and \
             f >= self.x.compressed().min() + self.bw*ledge and \
             f <= self.x.compressed().max() - self.bw*redge:
             try:
                 self.good_lines[line].append(ns[i])
                 self.good_lines[line+'_freq'].append(rf[i])
             except KeyError:
                 self.good_lines[line] = [ns[i]]
                 self.good_lines[line+'_freq'] = [rf[i]]
     try:
         self.good_lines[line]
         self.good_lines[line+'_freq']
     except KeyError:
         self.good_lines[line] = []
         self.good_lines[line+'_freq'] = []
Example No. 24
def get_df(sols_lst, panel, noise):
    "Extract dataframe with fpr and tpr from list of solutions"
    l = [(fpr(sol.imap, REF_IMAP), tpr(sol.imap, REF_IMAP))
         for sol in utils.flatten_list(sols_lst)]
    # Sort base on fpr
    l.sort(key=lambda tup: tup[0])
    x, y = zip(*l)
    df = pd.DataFrame({
        "fpr": x,
        "tpr": y,
        "panel": [panel] * len(x),
        "noise": [noise] * len(x)
    })
    return df
def get_datasets_political_parties(path='data/tobacco_full/'):
    """
    Loads Political party data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    arr = sorted(os.listdir(path))
    print('arr', arr)
    datasets = dict()
    class_value = 0
    datasets['data'] = []
    datasets['target'] = []
    datasets['target_names'] = []

    for input_file in arr:
        read_file = path + input_file
        with open(read_file, "r") as f:
            data = f.readlines()
        print('Data in each file', input_file, len(data))
        data = [s.strip() for s in data
                if len(s.strip()) > 0]  # ignoring empty lines
        target = [class_value for x in data]
        datasets['data'].append(data)

        datasets['target'].append(target)
        datasets['target_names'].append(input_file)
        class_value = class_value + 1

    # print('The Data before flattening: ', datasets['data'])
    datasets['data'] = utils.flatten_list(datasets['data'])
    datasets['target'] = utils.flatten_list(datasets['target'])
    #datasets['target_names'] = datasets['target_names']
    #print('The Data : ', datasets['data'])
    #print('The Target : ', datasets['target'])
    #print(len(datasets['data']))
    #print(len(datasets['target']))
    #print('The Target Names: ', datasets['target_names'])

    return datasets
Example No. 26
 def find_good_lines(self, line, lines, z=0, separation=0, redge=0.05, ledge=None):
     """
     Find any good lines in the Spectrum.
     
     Parameters
     ----------
     line : :obj:`str`
            Search for good lines of this kind.
     lines : :obj:`str`
            Compare against this kind of lines.
     z : :obj:`float`
            Redshift correction to apply to the rest frequencies.
     separation : :obj:`float`
             Minimum separation between lines to be considered good.
     redge : :obj:`float`
             Minimum distance of the line frequency from the right edge of the
             band, as a fraction of the bandwidth.
     ledge : :obj:`float`, optional
             Same as `redge`, but for the left edge. Defaults to `redge`.
     """
     
     # If no value is given for the left edge, use the same as for the right edge
     if ledge is None:
         ledge = redge
     
     # Find the lines within the Spectrum corresponding to the desired line
     ns, rf = self.find_lines(line, z)
     
     # Find other lines in the Spectrum
     ofs = []
     for l in lines:
         n, f = self.find_lines(l, z)
         ofs.append(list(f))
     
     fofs = utils.flatten_list(ofs)
     
     # Loop over lines checking that their separation from the other lines
     # is larger than separation.
     for i,f in enumerate(rf):
         diff = [abs(of - f) if of != f else separation+1 for of in fofs]
         if all(d > separation for d in diff) and \
             f >= self.x.compressed().min() + self.bw*ledge and \
             f <= self.x.compressed().max() - self.bw*redge:
             try:
                 self.good_lines[line].append(ns[i])
             except KeyError:
                 self.good_lines[line] = [ns[i]]
                 
     try:
         self.good_lines[line]
     except KeyError:
         self.good_lines[line] = []
Example No. 27
def set_run_transforms(run):
    ts = []
    spatial = []
    transforms = []

    for model in run['models']:
        model['transforms_nested'] = {
            'ts' : [],
            'spatial' : [],
            'transform' : []
        }
        for transform in model['transforms']:
            model['transforms_nested'][transform['type']].append(transform)
        #print(json.dumps(model['transforms_nested'], indent=2))


    for model in run['models']:
        ts.append(model['transforms_nested']['ts'])
        spatial.append(model['transforms_nested']['spatial'])
        transforms.append(model['transforms_nested']['transform'])
    ts = utils.flatten_list(ts)
    spatial = utils.flatten_list(spatial)
    transforms = utils.flatten_list(transforms)
    ts = utils.drop_duplicates_from_list_of_dicts(ts)
    spatial = utils.drop_duplicates_from_list_of_dicts(spatial)
    transforms = utils.drop_duplicates_from_list_of_dicts(transforms)

    for transform in ts+spatial+transforms:
        del transform['type']

    run['transforms'] = {
        'ts' : ts,
        'spatial' : spatial,
        'transforms' : transforms
    }
    return run
Example No. 28
def is_field(_td, _fields, verbose=False):
    '''
    This function checks whether fields are in a dict.
    
    Parameters
    ----------
    _td : dict / list of dict
        dict of trial data.
    _fields : str / list of str
        Fields in the trial data dict.
    verbose : bool, optional
        Describe what's happening in the code. The default is False.

    Returns
    -------
    return_val : bool
        Return whether fields are in the trial data dict or not.

    '''

    return_val = True

    # check dict input variable
    if type(_td) is dict:
        _td = [_td]

    if type(_td) is not list:
        raise Exception('ERROR: _td must be a list of dictionaries!')

    # check string input variable
    if type(_fields) is str:
        _fields = [_fields]

    if type(_fields) is not list:
        raise Exception('ERROR: _str must be a list of strings!')

    # Flatten list of fields
    _fields = flatten_list(_fields)

    for idx, td_tmp in enumerate(_td):
        for field in _fields:
            if field not in td_tmp.keys():
                return_val = False
                if verbose:
                    print('Field {} not in dict #{}'.format(field, idx))

    return return_val
Example No. 29
    def get_labels(self):
        """Gets the list of unique labels for this dataset

        Returns
        -------
        label_list : list of strings
            Sorted (alphabetical/numerical) list of labels as strings
        """
        if self.labels or not self.labelled_data:
            return self.labels
        else:
            all_labels = []
            # Could add in _test.csv but later might use meta learning so some text labels we
            # don't want to train/evaluate on...
            for set_type in ["_train.csv", "_dev.csv"]:
                file_path = os.path.join(self.data_dir,
                                         self.data_name + set_type)
                _, _, labels = self._read_csv_or_df(file_path)
                all_labels += flatten_list(labels)
            # Return the sorted unique values (as strings) of all the labels
            # Note that list(set(all_labels)) returns the unique labels as a list
            return sorted(list(map(str, list(set(all_labels)))))
def get_overlaps(chunks):
  chunks = [list(chunk) for chunk in chunks]
  # get overlaps of +/- 1 token
  # create temp chunks
  chunks_plus1_starts = copy.deepcopy(chunks)
  chunks_minus1_starts = copy.deepcopy(chunks)
  chunks_plus1_ends = copy.deepcopy(chunks)
  chunks_minus1_ends = copy.deepcopy(chunks)

  for i, chunk in enumerate(chunks):
    chunks_plus1_starts[i][1] = int(chunk[1] + 1)
    chunks_plus1_ends[i][2] = int(chunk[2] + 1)
    # pay attention with - 1
    if chunk[1] != chunk[2]:
      chunks_minus1_starts[i][1] = int(chunk[1] - 1)
      chunks_minus1_ends[i][2] = int(chunk[2] - 1)
  
  chunks_overlap = [tuple(chunk) for chunk in 
                    flatten_list([chunks, chunks_plus1_starts, chunks_minus1_starts, 
                                  chunks_plus1_ends, chunks_minus1_ends])]

  return chunks_overlap
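A hedged worked example of get_overlaps with a single hypothetical (label, start, end) chunk:

chunks = [('OPINION', 3, 5)]
get_overlaps(chunks)
# -> [('OPINION', 3, 5), ('OPINION', 4, 5), ('OPINION', 2, 5),
#     ('OPINION', 3, 6), ('OPINION', 3, 4)]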
Example No. 31
    def Make(self, *code, **kwargs):
        """
        """

        _return_type = None
        flatten = None

        if '_return_type' in kwargs:
            _return_type = kwargs['_return_type']
            del kwargs['_return_type']

        if 'flatten' in kwargs:
            flatten = kwargs['flatten']
            del kwargs['flatten']

        g, refs = dsl.Compile(code, self._refs)
        f = utils.compose2(g, self)
        flatten_f = lambda x: utils.flatten_list(x) if type(x) is list else x

        if flatten:
            f = utils.compose2(flatten_f, f)

        return self.__unit__(f, refs, _return_type=_return_type)
Example No. 32
def plot_normalized_distribution_over_time(raw_data_df):
    fig, axes = plt.subplots(4, 3, figsize=(16, 16))
    flat_axes = utils.flatten_list(axes)
    for i, (field_name, ax) in enumerate(zip(data.FIELD_NAMES, flat_axes)):
        x1 = raw_data_df[[
            _ for _ in raw_data_df.columns
            if utils.split_field_month(_)[0] == field_name
        ]]
        x2 = x1.div(x1.sum(axis=1), axis=0)
        x3 = x2.mean()
        x3.index = pd.DatetimeIndex(x3.index.map(utils.map_to_month)) \
            .strftime("%Y-%m")
        x3.plot(kind="bar", ax=ax, color="ggg" + "b" * (len(x3) - 3))
        ax.set_title(data.FIELD_NAMES_MAP[field_name], fontsize=18)
        ax.set_xticks([])

    # Hack
    for ax in flat_axes[-3:]:
        ax.set_xticks(range(len(x3.index)))
        ax.set_xticklabels(
            [_ if i % 3 == 0 else "" for (i, _) in enumerate(x3.index)])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=60)

    fig.suptitle("Normalized Distribution of Activity over Time", fontsize=30)
Example No. 33
async def scrape_lap_records(http_session: aiohttp.ClientSession,
                             track_id: int,
                             vehicle_id: int) -> List[LapRecordTuple]:
    first_soup = await _prepare_soup(http_session, track_id, vehicle_id)
    if first_soup.find("p", class_="error"):
        raise ValueError("invalid track_id and vehicle_id combination")

    number_of_pages = _get_number_of_pages(first_soup)
    if number_of_pages == 0:
        logger.debug("no lap records found")  # placeholder message
        return []

    tasks = []
    for page_n in range(2, number_of_pages + 1):
        tasks.append(
            _request_and_scrape_soup(http_session, track_id, vehicle_id,
                                     page_n))
    tasks.append(_scrape_soup(first_soup))

    results = flatten_list(await asyncio.gather(*tasks))
    logger.debug(
        f"Found {len(results)} records for track={track_id} and vehicle={vehicle_id}"
    )
    return results
Example No. 34
def create_df_node(df: DataFrame, node_attributes) -> DataFrame:
    """
    Create df_node from df, based on node_attributes.
    :param df: dataframe
    :param node_attributes: dict, e.g. {'NodeID1': ['LABELx', 'NAME', 'ATT']},
    :return: df_node dataframe
    """
    df_node = None
    for elem in node_attributes.items():
        node_columns = [x for x in flatten_list(elem) if x]
        _nodeID = node_columns[0]
        _df_node = df.select(node_columns)

        for col in node_columns:
            _df_node = _df_node.withColumnRenamed(
                f"{col}", f"{col[:-1]}")  # "idEntity:ID" # ":LABEL"

        if df_node is None:
            df_node = _df_node
        else:
            df_node = df_node.union(_df_node)

    df_node = df_node.dropDuplicates()
    return df_node
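For intuition, a hedged trace of how one node_attributes entry is flattened into column names, reusing the hypothetical names from the docstring:

node_attributes = {'NodeID1': ['LABELx', 'NAME', 'ATT']}
elem = next(iter(node_attributes.items()))           # ('NodeID1', ['LABELx', 'NAME', 'ATT'])
node_columns = [x for x in flatten_list(elem) if x]  # ['NodeID1', 'LABELx', 'NAME', 'ATT']
# each column is then renamed by dropping its last character, e.g. 'NodeID1' -> 'NodeID'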
Example No. 35
    def create_subject_arrays(self, double_precision=True):
        '''
            Create arrays with errors per subject and per num_target;
            also create an array with the precision per subject and num_target directly.
        '''

        unique_subjects = np.unique(self.dataset['subject'])
        unique_n_items = np.unique(self.dataset['n_items'])

        self.dataset['errors_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['errors_all_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['errors_nontarget_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['sizes_subject_nitems_trecall'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
        self.dataset['precision_subject_nitems_trecall_bays'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
        self.dataset['precision_subject_nitems_trecall_theo'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
        self.dataset['precision_subject_nitems_trecall_theo_nochance'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
        self.dataset['precision_subject_nitems_trecall_bays_notreatment'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))

        self.dataset['response_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['item_angle_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['target_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['nontargets_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)


        self.dataset['errors_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['errors_all_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['errors_nontarget_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['response_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['item_angle_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['target_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['nontargets_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)

        self.dataset['precision_nitems_trecall_bays'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))
        self.dataset['precision_nitems_trecall_theo'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))
        self.dataset['precision_nitems_trecall_theo_nochance'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))
        self.dataset['precision_nitems_trecall_bays_notreatment'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))



        for n_items_i, n_items in enumerate(unique_n_items):
            for subject_i, subject in enumerate(unique_subjects):
                for trecall_i, trecall in enumerate(np.arange(1, n_items+1)):
                    ids_filtered = ((self.dataset['subject']==subject) & (self.dataset['n_items'] == n_items) & (self.dataset['probe'] == trecall) & (self.dataset.get('masked', False) == False)).flatten()

                    # Invert the order of storage, 0 -> last item probed, 1 -> second to last item probe, etc...
                    # trecall_i = n_items - trecall

                    # Get the errors
                    self.dataset['errors_all_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['errors_all'][ids_filtered]

                    self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['errors_nontarget_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.extract_target_nontargets_columns(self.dataset['errors_all_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], trecall)


                    # Get the responses and correct item angles
                    # TODO (lmatthey) trecall here is inverted, should really fix it somehow...
                    self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['response'][ids_filtered].flatten()
                    self.dataset['item_angle_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['item_angle'][ids_filtered]

                    # Save target item and nontargets as well
                    self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.extract_target_nontargets_columns(self.dataset['item_angle'][ids_filtered], trecall)

                    # Get the number of samples per conditions
                    self.dataset['sizes_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i].size

                    # Compute the precision
                    self.dataset['precision_subject_nitems_trecall_bays'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=True, correct_orientation=True, use_wrong_precision=True)
                    self.dataset['precision_subject_nitems_trecall_theo'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=False)
                    self.dataset['precision_subject_nitems_trecall_theo_nochance'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=True, correct_orientation=False, use_wrong_precision=False)
                    self.dataset['precision_subject_nitems_trecall_bays_notreatment'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=True)

        # if double_precision:
        #     precision_subject_nitems *= 2.
        #     precision_subject_nitems_theo *= 2.
        #     # self.dataset['precision_subject_nitems_theo_nochance'] *= 2.
        #     self.dataset['precision_subject_nitems_bays_notreatment'] *= 2.


        # self.dataset['errors_nitems_trecall'] = np.array([utils.flatten_list(self.dataset['errors_subject_nitems_trecall'][:, n_items_i]) for n_items_i in xrange(unique_n_items.size)])


        # Store all/average subjects data
        for n_items_i, n_items in enumerate(unique_n_items):
            for trecall_i, trecall in enumerate(np.arange(1, n_items+1)):
                self.dataset['errors_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['errors_subject_nitems_trecall'][:, n_items_i, trecall_i]))
                self.dataset['errors_all_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['errors_all_subject_nitems_trecall'][:, n_items_i, trecall_i]))
                self.dataset['errors_nontarget_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['errors_nontarget_subject_nitems_trecall'][:, n_items_i, trecall_i]))

                # Responses, target, nontarget
                self.dataset['response_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['response_subject_nitems_trecall'][:, n_items_i, trecall_i]))
                self.dataset['target_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['target_subject_nitems_trecall'][:, n_items_i, trecall_i]))
                self.dataset['nontargets_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['nontargets_subject_nitems_trecall'][:, n_items_i, trecall_i]))
                self.dataset['item_angle_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['item_angle_subject_nitems_trecall'][:, n_items_i, trecall_i]))


                # Precision over all subjects errors (not average of precisions)
                self.dataset['precision_nitems_trecall_bays'][n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_nitems_trecall'][n_items_i, trecall_i], remove_chance_level=True, correct_orientation=True, use_wrong_precision=True)

                # self.dataset['precision_nitems_trecall_bays'] = np.mean(self.dataset['precision_subject_nitems_trecall_bays'], axis=0)
                self.dataset['precision_nitems_trecall_theo'] = np.mean(self.dataset['precision_subject_nitems_trecall_theo'], axis=0)
                self.dataset['precision_nitems_trecall_theo_nochance'] = np.mean(self.dataset['precision_subject_nitems_trecall_theo_nochance'], axis=0)
                self.dataset['precision_nitems_trecall_bays_notreatment'] = np.mean(self.dataset['precision_subject_nitems_trecall_bays_notreatment'], axis=0)
Example No. 36
def get_list_csv(collection=coll):
    """ return flatten list from get_meta_csv"""
    return flatten_list(list(get_meta_csv(collection)))
Example No. 37
mapping = defaultdict(lambda: None)
mapping['ACQNO'] = lambda x: [{'tag': '100', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['ACTIV'] = lambda x: [{'tag': '101', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['AU'] =    lambda x: [{'tag': '102', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': e}} for e in split_subfields(x)]
mapping['CITED'] = lambda x: [{'tag': '103', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': e}} for e in split_subfields(x)]
mapping['COPY'] =  lambda x: [{'tag': '104', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['CTI'] =   lambda x: [{'tag': '105', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['DES'] =   lambda x: [{'tag': '106', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['ED'] =    lambda x: [{'tag': '107', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['EXCLM'] = lambda x: [{'tag': '108', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['EXP'] =   lambda x: [{'tag': '109', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['FREQ'] =  lambda x: [{'tag': '110', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': fix_FREQ.get(x, x)}}]
mapping['HOLD'] =  lambda x: [{'tag': '111', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': e}} for e in split_subfields(x)]
mapping['ISSN'] =  lambda x: [{'tag': '112', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['ISSUE'] = process_ISSUE #lambda x: [{'tag': '113', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['LANG'] =  lambda x: [{'tag': '114', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': languages[e.title()]}} for e in flatten_list([a.split('/') for a in split_subfields(x)])]
mapping['OS'] =    lambda x: [{'tag': '115', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['PDATE'] = lambda x: [{'tag': '116', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['PNOTE'] = lambda x: [{'tag': '117', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': '; '.join(x.strip(whitespace+'|').split('\n        |'))}}]
mapping['PSTAT'] = lambda x: [{'tag': '118', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['PUB'] =   lambda x: [{'tag': '119', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['REG'] =   lambda x: [{'tag': '120', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['STAMP'] = process_UP('121')
mapping['ROUTE'] = process_ROUTE
mapping['SUB'] =   lambda x: [{'tag': '123', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['TI'] =    lambda x: [{'tag': '124', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['TNOTE'] = lambda x: [{'tag': '125', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['TYPE'] =  lambda x: [{'tag': '126', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['VADDR'] = lambda x: [{'tag': '127', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['VCODE'] = lambda x: [{'tag': '128', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['UP'] =    process_STAMP
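As an illustration of the LANG rule above, a hedged trace with a hypothetical field value, assuming split_subfields returns the repeated subfield values of the field:

x = 'English/French|German'                # hypothetical raw LANG value
subfields = ['English/French', 'German']   # what split_subfields is assumed to return here
parts = [a.split('/') for a in subfields]  # [['English', 'French'], ['German']]
flatten_list(parts)                        # ['English', 'French', 'German']
# each name is then title-cased and mapped through the `languages` lookup table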
Example No. 38
    def fit_mixture_model(self):
        unique_subjects = np.unique(self.dataset['subject'])
        unique_n_items = np.unique(self.dataset['n_items'])

        # Initialize empty arrays
        em_fits_keys = ['kappa', 'mixt_target', 'mixt_nontargets', 'mixt_nontargets_sum', 'mixt_random', 'train_LL', 'K', 'aic', 'bic']

        self.dataset['em_fits'] = dict()
        for k in em_fits_keys:
            self.dataset['em_fits'][k] = np.nan*np.empty(self.dataset['probe'].size)

        self.dataset['em_fits']['resp_target'] = np.nan*np.empty(self.dataset['probe'].size)
        self.dataset['em_fits']['resp_nontarget'] = np.nan*np.empty(self.dataset['probe'].size)
        self.dataset['em_fits']['resp_random'] = np.nan*np.empty(self.dataset['probe'].size)

        self.dataset['em_fits_subjects_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['em_fits_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
        self.dataset['em_fits_subjects_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)

        # for subject_i, subject in enumerate(unique_subjects):
        #     self.dataset['em_fits_subjects_nitems_trecall'][subject] = dict()
        #     for n_items_i, n_items in enumerate(unique_n_items):
        #         self.dataset['em_fits_subjects_nitems_trecall'][subject][n_items] = dict()

        self.dataset['em_fits_nitems_trecall_mean'] = dict(mean=dict(), std=dict(), values=dict())

        # Compute mixture model fits per n_items, subject and trecall
        for n_items_i, n_items in enumerate(unique_n_items):
            for subject_i, subject in enumerate(unique_subjects):
                for trecall_i, trecall in enumerate(np.arange(1, n_items + 1)):
                    ids_filtered = ((self.dataset['subject']==subject) & (self.dataset['n_items'] == n_items) & (self.dataset['probe'] == trecall) & (self.dataset.get('masked', False) == False)).flatten()
                    # Invert the order of storage, 0 -> last item probed, 1 -> second to last item probe, etc...
                    # trecall_i = n_items - trecall

                    print("Fit mixture model, %d items, subject %d, trecall %d, %d datapoints (%d)" % (n_items, subject, trecall, np.sum(ids_filtered), self.dataset['sizes_subject_nitems_trecall'][subject_i, n_items_i, trecall_i]))

                    params_fit = em_circular_mixture_to_use.fit(self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, trecall_i])
                    params_fit['mixt_nontargets_sum'] = np.sum(params_fit['mixt_nontargets'])
                    # print self.dataset['response'][ids_filtered, 0].shape, self.dataset['item_angle'][ids_filtered, 0].shape, self.dataset['item_angle'][ids_filtered, 1:].shape

                    # cross_valid_outputs = em_circularmixture.cross_validation_kfold(self.dataset['response'][ids_filtered, 0], self.dataset['item_angle'][ids_filtered, 0], self.dataset['item_angle'][ids_filtered, 1:], K=10, shuffle=True, debug=False)
                    # params_fit = cross_valid_outputs['best_fit']
                    resp = em_circular_mixture_to_use.compute_responsibilities(self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], params_fit)

                    for k, v in params_fit.items():
                        self.dataset['em_fits'][k][ids_filtered] = v

                    # params_fit['responsibilities'] = resp

                    self.dataset['em_fits']['resp_target'][ids_filtered] = resp['target']
                    self.dataset['em_fits']['resp_nontarget'][ids_filtered] = np.sum(resp['nontargets'], axis=1)
                    self.dataset['em_fits']['resp_random'][ids_filtered] = resp['random']

                    self.dataset['em_fits_subjects_nitems_trecall'][subject_i, n_items_i, trecall_i] = params_fit

                # Do not look at trecall (weird but whatever)
                params_fit = em_circular_mixture_to_use.fit(np.array(utils.flatten_list(self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, :n_items_i+1])), np.array(utils.flatten_list(self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, :n_items_i+1])), np.array(utils.flatten_list(self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, :n_items_i+1])))

                self.dataset['em_fits_subjects_nitems'][subject_i, n_items_i] = params_fit


        for n_items_i, n_items in enumerate(unique_n_items):
            for k in ['mean', 'std', 'values']:
                self.dataset['em_fits_nitems_trecall_mean'][k][n_items] = dict()

            for trecall_i, trecall in enumerate(np.arange(1, n_items + 1)):
                for k in ['mean', 'std', 'values']:
                    self.dataset['em_fits_nitems_trecall_mean'][k][n_items][trecall] = dict()

                ## Now compute mean/std em_fits per n_items, trecall
                # Refit the model mixing all subjects together (not sure how we could get sem, 1-held?)
                params_fit = em_circular_mixture_to_use.fit(self.dataset['response_nitems_trecall'][n_items_i, trecall_i], self.dataset['target_nitems_trecall'][n_items_i, trecall_i], self.dataset['nontargets_nitems_trecall'][n_items_i, trecall_i])
                self.dataset['em_fits_nitems_trecall'][n_items_i, trecall_i] = params_fit

                # Need to extract the values for a subject/nitems pair, for all keys of em_fits. Annoying dictionary indexing needed
                for key in em_fits_keys:
                    fits_persubjects = [self.dataset['em_fits_subjects_nitems_trecall'][subject_i, n_items_i, trecall_i][key]
                                        for subject_i in range(len(unique_subjects))]

                    self.dataset['em_fits_nitems_trecall_mean']['mean'][n_items][trecall][key] = np.mean(fits_persubjects)
                    self.dataset['em_fits_nitems_trecall_mean']['std'][n_items][trecall][key] = np.std(fits_persubjects)
                    self.dataset['em_fits_nitems_trecall_mean']['values'][n_items][trecall][key] = fits_persubjects

        ## Construct array versions of the em_fits_nitems mixture proportions, for convenience
        self.construct_arrays_em_fits()
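
The trecall-independent refit above relies on utils.flatten_list to pool the per-recall-position arrays into single flat inputs before calling fit again. Below is a minimal sketch of that pooling step; the flatten_list helper here is a stand-in that assumes exactly one level of nesting, and the response values are hypothetical.

import numpy as np

def flatten_list(list_of_lists):
    # Assumed behaviour: flatten exactly one level of nesting.
    return [item for sublist in list_of_lists for item in sublist]

# Hypothetical per-trecall responses for one subject at n_items = 3.
responses_per_trecall = [np.array([0.10, -0.40]),
                         np.array([0.30]),
                         np.array([-0.20, 0.05])]

# Pool across recall positions, as done before the trecall-independent fit.
pooled_responses = np.array(flatten_list(responses_per_trecall))
print(pooled_responses)  # one flat array of the 5 pooled responses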
Example No. 39
0
def main():
    global d
    if not d:
        d = get_items('BARCD')
    total = float(len(d))
    i = 1
    fb = open(export_dir+'ITEMS.marc.dat', 'wb')
    ft = open(export_dir+'ITEMS.marc.txt', 'w')
    print('Exporting items...')
    item_count = 0
    for (recid, copies) in d.items():
        if not is_staff_paper(recid):
            record = Record()
            id_field = Field(tag='999', indicators=[' ', ' '], subfields=['a', recid, 'b', ALL[recid].get('ID', '')])
            record.add_ordered_field(id_field)
            for copy_id, copy_data in copies.items():
                aux = [(code, items_fix[code](value)) for code, value in copy_data.items() if code in items_fix]
                item_field = Field(tag='945', indicators=[' ', ' '], subfields=['b', copy_id] + flatten_list(aux))
                record.add_ordered_field(item_field)
                item_count = item_count + 1
            fb.write(record.as_marc())
            ft.write(str(record) + '\n==================\n')
        update_progress(i*100/total)
        i = i + 1
    print "\nRecords:\t" + str(int(total))
    print "Items:  \t" + str(item_count)
    fb.close()
    ft.close()
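
In the export loop above, each copy contributes (subfield code, value) pairs, and flatten_list turns that pair list into the flat, alternating code/value list passed to Field(..., subfields=...). A minimal sketch of that flattening, with a hypothetical items_fix mapping and a stand-in flatten_list (one level of nesting assumed):

def flatten_list(list_of_lists):
    # Assumed behaviour: flatten one level of nesting (tuples included).
    return [item for sublist in list_of_lists for item in sublist]

# Hypothetical per-subfield fix-up functions, keyed by subfield code.
items_fix = {
    'i': lambda v: v.strip(),   # barcode
    'l': lambda v: v.upper(),   # location code
}

copy_data = {'i': ' 39002001 ', 'l': 'main', 'z': 'ignored'}

aux = [(code, items_fix[code](value)) for code, value in copy_data.items() if code in items_fix]
subfields = ['b', 'COPY-1'] + flatten_list(aux)
print(subfields)  # ['b', 'COPY-1', 'i', '39002001', 'l', 'MAIN']

The alternating code/value layout matches the older pymarc calling convention used in the snippet; newer pymarc releases moved to explicit Subfield objects instead.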
Example No. 40
0
            sys.exit("Selected mutation tool reports that it doesn't support the current project.")

    print("Creating mutants...")
    mdir, mutants = master.mutate()

    print("Scoring mutants in parallel...")
    divided_mutants = divide(mutants, args.scorers)

    # functools.partial is used instead of a lambda below, as the latter can't be pickled
    toolfun = functools.partial(load_tool, args.mutation_tool)
    scorefun = functools.partial(local_scorer.create_and_score, toolfun, cwd, mdir)

    with Pool(processes=args.scorers) as pool:
        nested_results = pool.map(scorefun, divided_mutants, 1)

    results = ScoringResult(flatten_list(nested_results))

    if not args.benchmark:
        print("Loading mutant metadata from the filesystem...")
        results.add_metadata(cwd, mdir)

    if args.ci_mode:
        passed = results.percentage_score >= args.ci_threshold
        reporter = load_reporter(args.reporter, results, passed, args.ci_threshold)
    else:
        reporter = load_reporter(args.reporter, results)

    print("Reporting mutation testing results...")
    reporter.report()

    sys.exit(1 if args.ci_mode and not passed else 0)
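
The scoring step above follows a split/score/recombine pattern: divide (assumed here to split the mutant list into roughly equal chunks, one per scorer) feeds the worker pool, and flatten_list merges the per-worker result lists back into the single flat list handed to ScoringResult. A minimal, self-contained sketch of that round trip, with stand-in helpers and a hypothetical score_chunk worker:

from multiprocessing import Pool

def flatten_list(list_of_lists):
    # Assumed behaviour: flatten one level of nesting.
    return [item for sublist in list_of_lists for item in sublist]

def divide(items, n_chunks):
    # Stand-in for divide(): n_chunks roughly equal slices.
    k, m = divmod(len(items), n_chunks)
    return [items[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n_chunks)]

def score_chunk(chunk):
    # Stand-in for the real scorer: one result dict per mutant.
    return [{'mutant': mutant_id, 'killed': mutant_id % 2 == 0} for mutant_id in chunk]

if __name__ == '__main__':
    mutants = list(range(10))
    with Pool(processes=2) as pool:
        nested_results = pool.map(score_chunk, divide(mutants, 2), 1)
    results = flatten_list(nested_results)
    assert len(results) == len(mutants)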
Example No. 41
0
    def find_lines(self, line, z=0, verbose=False):
        """
        Finds if there are any lines of a given type in the frequency range.
        The line frequencies are corrected for redshift.
        
        Parameters
        ----------
        line : :obj:`string`
               Line type to search for.
        z :    :obj:`float`
            Redshift to apply to the rest frequencies.
        verbose : :obj:`bool`
                  Verbose output?
        
        Returns
        -------
        n : :obj:`numpy.array`
            Principal quantum numbers. 
        reference_frequencies : :obj:`numpy.array`
                                Reference frequencies of the lines inside the spectrum in MHz. 
                                The frequencies are redshift corrected.
        
        See Also
        --------
        crrlpy.crrls.load_ref : Describes the format of line and the available ones.
        
        Examples
        --------
        >>> from crrlpy.spec import Spectrum
        >>> freq = [10, 11]
        >>> temp = [1, 1]
        >>> spec = Spectrum(freq, temp)
        >>> ns, rf = spec.find_lines('RRL_CIalpha')
        >>> ns
        array([ 843.,  844.,  845.,  846.,  847.,  848.,  849.,  850.,  851.,
                852.,  853.,  854.,  855.,  856.,  857.,  858.,  859.,  860.,
                861.,  862.,  863.,  864.,  865.,  866.,  867.,  868.,  869.])
        """
        
        if not isinstance(line, str):
            raise ValueError('line should be a string')
            
        # Load the reference frequencies.
        qn, restfreq = crrls.load_ref(line)
        
        # Correct rest frequencies for redshift.
        reffreq = restfreq/(1.0 + z)
        
        # Check which lines lie within the sub band.
        mask_ref = (self.x.compressed()[0] < reffreq) & \
                   (self.x.compressed()[-1] > reffreq)
        reffreqs = reffreq[mask_ref]
        refqns = qn[mask_ref]
        
        if line not in self.lines:
            
            try:
                self.lines[line].append(refqns)
                self.lines[line+'_freq'].append(reffreqs)
            except KeyError:
                self.lines[line] = [refqns]
                self.lines[line+'_freq'] = [reffreqs]

            self.lines[line] = utils.flatten_list(self.lines[line])
            self.lines[line+'_freq'] = utils.flatten_list(self.lines[line+'_freq'])
            
        nlin = len(reffreqs)
        if verbose:
            print "Found {0} {1} lines within the subband.".format(nlin, line)
            if nlin > 1:
                print "Corresponding to n values: {0}--{1}".format(refqns[0], refqns[-1])
            elif nlin == 1:
                print "Corresponding to n value {0} and frequency {1} MHz".format(refqns[0], reffreqs[0])

        return refqns, reffreqs
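
The self.lines bookkeeping near the end of find_lines appends the new quantum-number array and then flattens, so the stored value is a flat list of n values rather than a list of arrays. A minimal sketch of that append-then-flatten step, again with a stand-in flatten_list (one level of nesting assumed):

import numpy as np

def flatten_list(list_of_lists):
    # Assumed behaviour: flatten one level of nesting.
    return [item for sublist in list_of_lists for item in sublist]

lines = {}
refqns = np.array([843., 844., 845.])

# First time this line type is seen: the KeyError branch creates the entry...
try:
    lines['RRL_CIalpha'].append(refqns)
except KeyError:
    lines['RRL_CIalpha'] = [refqns]

# ...and flattening turns the list of arrays into a flat list of n values.
lines['RRL_CIalpha'] = flatten_list(lines['RRL_CIalpha'])
print([float(n) for n in lines['RRL_CIalpha']])  # [843.0, 844.0, 845.0]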
Example No. 42
0
def best_points_allT(result_dist_to_use):
    return np.array(utils.flatten_list([best_points_T(result_dist_to_use, T) for T in T_space]))
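
best_points_allT pools the per-temperature best points into a single array; flatten_list strips the per-T level of nesting before the np.array call. A short sketch with a hypothetical best_points_T, a hypothetical T_space, and a stand-in flatten_list:

import numpy as np

def flatten_list(list_of_lists):
    # Assumed behaviour: flatten one level of nesting.
    return [item for sublist in list_of_lists for item in sublist]

T_space = [1, 2, 4]

def best_points_T(result_dist_to_use, T):
    # Hypothetical per-temperature result: a few point indices.
    return [T, 10 * T]

def best_points_allT(result_dist_to_use):
    return np.array(flatten_list([best_points_T(result_dist_to_use, T) for T in T_space]))

print(best_points_allT(None))  # one flat array: 1, 10, 2, 20, 4, 40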
Example No. 43
0
    def find_lines(self, line, z=0, verbose=False):
        """
        Finds if there are any lines of a given type in the frequency range.
        The line frequencies are corrected for redshift.
        
        Parameters
        ----------
        line : :obj:`string`
               Line type to search for.
        z :    :obj:`float`
            Redshift to apply to the rest frequencies.
        verbose : :obj:`bool`
                  Verbose output?
        
        Returns
        -------
        n : :obj:`numpy.array`
            Principal quantum numbers. 
        reference_frequencies : :obj:`numpy.array`
                                Reference frequencies of the lines inside the spectrum in MHz. 
                                The frequencies are redshift corrected.
        
        See Also
        --------
        crrlpy.crrls.load_ref : Describes the format of line and the available ones.
        
        Examples
        --------
        >>> from crrlpy.spec import Spectrum
        >>> freq = [10, 11]
        >>> temp = [1, 1]
        >>> spec = Spectrum(freq, temp)
        >>> ns, rf = spec.find_lines('RRL_CIalpha')
        >>> ns
        array([ 843.,  844.,  845.,  846.,  847.,  848.,  849.,  850.,  851.,
                852.,  853.,  854.,  855.,  856.,  857.,  858.,  859.,  860.,
                861.,  862.,  863.,  864.,  865.,  866.,  867.,  868.,  869.])
        """
        
        if not isinstance(line, str):
            raise ValueError('line should be a string')
            
        # Load the reference frequencies.
        qn, restfreq = crrls.load_ref(line)
        
        # Correct rest frequencies for redshift.
        reffreq = restfreq/(1.0 + z)
        
        # Check which lines lie within the sub band.
        mask_ref = (self.x.compressed()[0] < reffreq) & \
                   (self.x.compressed()[-1] > reffreq)
        reffreqs = reffreq[mask_ref]
        refqns = qn[mask_ref]
        
        if line not in self.lines:
            
            try:
                self.lines[line].append(refqns)
            except KeyError:
                self.lines[line] = [refqns]
            #print self.lines[line]
            self.lines[line] = utils.flatten_list(self.lines[line])
            
        nlin = len(reffreqs)
        if verbose:
            print "Found {0} {1} lines within the subband.".format(nlin, line)
            if nlin > 1:
                print "Corresponding to n values: {0}--{1}".format(refqns[0], refqns[-1])
            elif nlin == 1:
                print "Corresponding to n value {0} and frequency {1} MHz".format(refqns[0], reffreqs[0])

        return refqns, reffreqs