Example #1
    def calculate_backtest_performance(self):
        """
        :return: returns a DataSet object with 2 keys: net_asset_value (pd.Series), holdings (pd.DataFrame)
        """
        net_asset_value_series = pd.Series(dtype=float)
        net_asset_value_series.loc[self.bt_dt_index[0]] = self.aum
        holdings_df = self.holdings

        for date in self.dt_df.index:
            # get previous date
            date_previous = self.dt_df.dt_previous.loc[date]
            # calculate pnl per instrument
            self.pnl_per_instrument.loc[date] = self._calculate_instruments_pnl(self.dt_df.loc[date])
            net_asset_value_now = self._update_net_asset_value(
                self.dt_df.loc[date], net_asset_value_series.loc[date_previous], holdings_df.loc[date_previous])
            # update nav at date t
            net_asset_value_series.loc[date] = net_asset_value_now
            # don't update holdings at last date as we have no open t+1 price
            if date != self.dt_df.index[-1]:
                holdings_df.loc[date] = self._update_holdings(
                    self.dt_df.loc[date], holdings_df.loc[date_previous], net_asset_value_now)
            if date.is_year_end and date.time().hour == 23:
                print(f"\nStrategy value calculated for {date}\n")
        backtest_dataset = DataSet()
        backtest_dataset.holdings = holdings_df.resample(self.frequency).last()
        backtest_dataset.net_asset_value = net_asset_value_series.resample(self.frequency).last()
        backtest_dataset.cumulative_pnl_per_instrument = self._get_pnl_attribution()
        self._backtest = backtest_dataset

        return self._backtest
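A minimal usage sketch for the method above; `strategy` is a hypothetical instance of the backtester class, and the attributes read below follow the DataSet assembled in the method:

# Hedged usage sketch: `strategy` is a hypothetical backtester instance.
backtest = strategy.calculate_backtest_performance()
print(backtest.net_asset_value.tail())               # resampled NAV series
print(backtest.holdings.tail())                      # resampled holdings DataFrame
print(backtest.cumulative_pnl_per_instrument.tail()) # per-instrument PnL attribution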
Example #2
def sahm_unemployment_indicator(end_date=None):
    """
    returns a boolean series where True indicates that according to Claudia Sahm indicator economy will be in recession
    :param end_date: unemployment rate
    :return:dataset with indicator
    """
    if end_date is None:
        end_date = datetime.datetime.today().strftime('%Y-%m')

    # create Fred object and load data
    indicator = FRED('UNRATE')
    data = indicator.get_fred_time_series('1960-1', observation_end=end_date)

    # Sahm rule: 3-month moving average of the unemployment rate minus its 12-month low
    three_month_avg = data.rolling(window=3).mean()
    twelve_month_low = data.rolling(window=12).min()
    difference = three_month_avg - twelve_month_low
    recession_dates = data[difference > 0.5].index

    sahm_info = DataSet()
    sahm_info.recession_dates = recession_dates
    sahm_info.indicator = difference
    sahm_info.is_recession = difference > 0.5

    return sahm_info
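A short usage sketch, assuming the FRED wrapper and DataSet container used above are importable; the end date is illustrative:

# Hedged usage sketch of the Sahm indicator helper.
sahm = sahm_unemployment_indicator(end_date='2020-06')
print(sahm.recession_dates[:5])    # months where the 0.5pp rule fired
print(sahm.is_recession.tail())    # boolean flag per observation
sahm.indicator.plot(title='3-month avg unemployment minus 12-month low')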
Example #3
    def calculate_backtest_performance(self):

        # outputs a DataSet with entries: net_asset_value (pd.Series) and holdings (pd.DataFrame)
        net_asset_value_series = pd.Series(dtype=float)
        net_asset_value_series.loc[self.bt_dt_index[0]] = self.aum
        holdings_df = self.holdings

        for date in self.dt_df.index:
            # get previous date
            date_previous = self.dt_df.dt_previous.loc[date]
            # calculate pnl per instrument
            self.pnl_per_instrument.loc[date] = self._calculate_instruments_pnl(self.dt_df.loc[date])
            net_asset_value_now = self._update_net_asset_value(
                self.dt_df.loc[date], net_asset_value_series.loc[date_previous], holdings_df.loc[date_previous])
            # update nav at date t
            net_asset_value_series.loc[date] = net_asset_value_now
            # don't update holdings at last date as we have no open t+1 price
            # if date != self.trading_dt_index[-1]:
            if date != self.dt_df.index[-1]:
                holdings_df.loc[date] = self._update_holdings(
                    self.dt_df.loc[date], holdings_df.loc[date_previous], self.aum)
            if date.is_year_end and date.time().hour == 23:
                print(f"\nStrategy value calculated for {date}\n")
        backtest_dataset = DataSet()
        backtest_dataset.holdings = holdings_df
        backtest_dataset.net_asset_value = net_asset_value_series
        self._backtest = backtest_dataset

        return self._backtest
Example #4
def symmetric_cusum_filter(data, threshold):
    """Symmetric CUSUM filter: collects the dates where the cumulative move of `data` exceeds `threshold` in either direction."""
    # set up containers for the sampled dates and the positive / negative running sums
    dates, pos_sum, neg_sum = [], 0, 0
    diff = data.diff()
    pos_series = pd.Series(index=diff.index, dtype=float)
    neg_series = pd.Series(index=diff.index, dtype=float)

    # calculate cumulative sums and reset if > or below threshold
    for date in diff.index[1:]:
        pos_sum, neg_sum = max(0, pos_sum + diff.loc[date]), min(
            0, neg_sum + diff.loc[date])
        pos_series.loc[date] = pos_sum
        neg_series.loc[date] = neg_sum
        if neg_sum < -threshold:
            neg_sum = 0
            dates.append(date)
        elif pos_sum > threshold:
            pos_sum = 0
            dates.append(date)

    cumulative_series = pd.concat(
        [pos_series.cumsum(), neg_series.cumsum()], axis=1)
    cumulative_series.columns = ['pos_series', 'neg_series']

    cusum_filter = DataSet()
    cusum_filter.dates = pd.DatetimeIndex(dates)
    cusum_filter.threshold = threshold
    cusum_filter.cumulative_series = cumulative_series

    return cusum_filter
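An illustrative call on a synthetic random walk; pandas and numpy are assumed, and the series and threshold are made up for the sketch:

import numpy as np
import pandas as pd

# Hypothetical input: a synthetic random-walk 'price' series.
rng = np.random.default_rng(0)
idx = pd.date_range('2020-01-01', periods=500, freq='D')
prices = pd.Series(rng.normal(0, 0.5, len(idx)).cumsum() + 100, index=idx)

events = symmetric_cusum_filter(prices, threshold=2.0)
print(len(events.dates), 'sampled event dates')
events.cumulative_series.plot()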
Example #5
 def set_dataset(self, data_path):
     train_set = DataSet(data_path['dataset']['train'])
     self.train_set, self.val_set = train_set.split(split_rate=0.1,
                                                    shuffle=True)
     self.test_set = DataSet(data_path['dataset']['test'])
     print(
         f'Train Size: {len(self.train_set)}, Val Size: {len(self.val_set)}, Test Size: {len(self.test_set)}'
     )
Example #6
 def test_relevance(self):
     dataset = DataSet()
     for i in range(0, 15):
         headline = dataset.stances[i]
         body = dataset.getBody(headline)
         print(headline)
         feature_set = FeatureFactory.get_feature_set(
             headline['Headline'], body)
         print(feature_set)
Example #7
def set_data_config(univ):
    """
    sets data specs including path, symbols, trading calendar
    :param univ: name of universe (e.g. universe_tech)
    :return data_config: config for data
    """
    data_config = DataSet()
    data_config.meta_data_root = './meta_data/universe_meta_data.json'
    data_config.symbols_list = univ

    return data_config
Example #8
    def test_multi_classification(self):
        '''
            Used to test the accuracy of related/unrelated headlines against bodies
        '''
        dataset = DataSet()
        features = []
        classifications = []
        for i in range(0, 2500):
            headline = dataset.stances[i]
            body = dataset.getBody(headline)
            feature_set = FeatureFactory.get_feature_set(
                headline['Headline'], body)
            ngram_hits = feature_set.n_grams[0]
            ngram_early_hits = feature_set.n_grams[1]
            polarity_headline = feature_set.polarity[0]
            polarity_body = feature_set.polarity[1]
            feature_sub_set = [
                feature_set.co_occurence, polarity_headline, polarity_body,
                ngram_hits, ngram_early_hits
            ]
            classification = headline['Stance']
            features.append(feature_sub_set)
            classifications.append(classification)

        classifier = svm.SVC()
        featuresArray = np.asarray(features)
        classificationArray = np.asarray(classifications)
        classifier.fit(featuresArray, classificationArray)

        print("SVM built and data fit")
        results = []
        correct_hits = 0
        for i in range(2500, 7500):
            headline = dataset.stances[i]
            body = dataset.getBody(headline)
            feature_set = FeatureFactory.get_feature_set(
                headline['Headline'], body)

            ngram_hits = feature_set.n_grams[0]
            ngram_early_hits = feature_set.n_grams[1]
            polarity_headline = feature_set.polarity[0]
            polarity_body = feature_set.polarity[1]
            feature_sub_set = [
                feature_set.co_occurence, polarity_headline, polarity_body,
                ngram_hits, ngram_early_hits
            ]
            classification = headline['Stance']
            prediction = classifier.predict([feature_sub_set])  # predict expects a 2-D array
            if classification == prediction[0]:
                correct_hits += 1

        print "Percent correct for multi-classification", (correct_hits / 5000)
Example #9
def get_head_body_tuples(include_holdout=False):
    # file paths
    splits_dir = "splits"
    dataset = DataSet()

    def get_stances(dataset, folds, holdout):
        # Collects the stance dicts ({'Headline': ..., 'Body ID': ..., 'Stance': ...}) for every
        # fold, plus the holdout stances when include_holdout is True
        stances = []
        for stance in dataset.stances:
            if stance['Body ID'] in holdout and include_holdout:
                stances.append(stance)
            for fold in folds:  # TODO maybe just flatten folds beforehand
                if stance['Body ID'] in fold:
                    stances.append(stance)

        return stances

    # create new vocabulary
    folds, holdout = kfold_split(dataset, n_folds=10, base_dir=splits_dir)  # [[133,1334,65645,], [32323,...]] => body ids for each fold
    stances = get_stances(dataset, folds, holdout)

    print("Stances length: " + str(len(stances)))

    h = []
    b = []
    # create the final lists with all the headlines and bodies of the set except for holdout
    for stance in stances:
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    return h, b
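A minimal call sketch, assuming the FNC-1 DataSet and kfold_split utilities used above are on the path:

# Hedged usage sketch: pairs each stance headline with its article body text.
headlines, bodies = get_head_body_tuples(include_holdout=False)
print(len(headlines), len(bodies))   # equal lengths, one body per stance
print(headlines[0][:80])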
Example #10
def load_crypto_data_from_disk(universe_symbols, frequency):
    universe = Universe(universe_symbols, frequency)
    universe_data = DataSet()

    for crypto in universe_symbols:
        data_root = universe.universe_meta_data.loc[crypto]['DataRoot']
        if frequency != '1H':
            data_df = pd.read_csv(data_root + crypto + '_d.csv',
                                  skiprows=1,
                                  index_col='Date',
                                  parse_dates=True).sort_index()
        else:
            data_df = pd.read_csv(data_root + crypto + '_1h.csv',
                                  skiprows=1,
                                  index_col='Date',
                                  parse_dates=True).sort_index()
            data_df.index = pd.to_datetime(data_df.index,
                                           format='%Y-%m-%d %I-%p')
        # todo: proper dynamic frequency
        data_df = data_df.resample(frequency).last()
        for date in data_df.index:
            if data_df.loc[date].isna().sum() == data_df.shape[1]:
                print("Warning: After resampling, no data available for " +
                      crypto + " for date " + str(date))
        universe_data[crypto] = data_df

    universe.read_universe_data(universe_data)

    return universe


# close = universe.get_cross_sectional_view('Close')
# btcusd = universe.raw_data.BTCUSD.get_df_view(['Open', 'High', 'Low', 'Close', 'VolumeUSD'])
Example #11
 def test_stop_word_removal(self):
     dataset = DataSet()
     text = dataset.articles[140]
     tokenizedArticle = tokenizer.tokenize_text(text)
     lemmatizedText = tokenizer.lemmatize_text(tokenizedArticle)
     removedStopWords = tokenizer.remove_stop_words(lemmatizedText)
     self.assertTrue(len(removedStopWords) < len(lemmatizedText))
Example #12
def doc2vecModelGenerator(lemma=True,
                          toLower=True,
                          punctuationRemove=True,
                          stopWordRemove=True):
    d = DataSet()
    headlines, bodies = [], []
    print("Preprocessing data...")
    for i, stance in tqdm(enumerate(d.stances)):
        _h = stance['Headline']
        _h = preprocessing(_h,
                           lemma=lemma,
                           toLower=toLower,
                           punctuationRemove=punctuationRemove,
                           stopWordRemove=stopWordRemove)
        headlines.append(_h)
        _b = d.articles[stance['Body ID']]
        _b = preprocessing(_b,
                           lemma=lemma,
                           toLower=toLower,
                           punctuationRemove=punctuationRemove,
                           stopWordRemove=stopWordRemove)
        bodies.append(_b)

    print("Tagging data...")
    h_tagged_data = [
        TaggedDocument(words=nltk.word_tokenize(_d), tags=[str(i)])
        for i, _d in tqdm(enumerate(headlines))
    ]
    b_tagged_data = [
        TaggedDocument(words=nltk.word_tokenize(_d), tags=[str(i)])
        for i, _d in tqdm(enumerate(bodies))
    ]
    h_model = Doc2Vec(vector_size=5,
                      alpha=0.025,
                      min_alpha=0.00025,
                      min_count=1,
                      dm=1,
                      dm_concat=1,
                      epochs=100)
    b_model = Doc2Vec(vector_size=20,
                      alpha=0.025,
                      min_alpha=0.00025,
                      min_count=1,
                      dm=1,
                      dm_concat=1,
                      epochs=100)
    print("doc2vec Model Vocab Building...")
    h_model.build_vocab(h_tagged_data, progress_per=5000)
    b_model.build_vocab(b_tagged_data, progress_per=5000)

    print("doc2vec Model Vocab Training...")
    h_model.train(h_tagged_data,
                  total_examples=h_model.corpus_count,
                  epochs=h_model.epochs)
    b_model.train(b_tagged_data,
                  total_examples=b_model.corpus_count,
                  epochs=b_model.epochs)
    h_model.save("models/h_d2v.model")
    b_model.save("models/b_d2v.model")
Example #13
 def get_futures_prices_dataset(self):
     series_types = ['cont', 'near', 'far']
     prices_dataset = DataSet()
     for series in series_types:
         prices_dataset[series] = self._get_cross_sectional_view(
             attribute=series)
     self.price_data = prices_dataset
     return prices_dataset
Example #14
 def __init__(self, universe, frequency=""):
     """
     :param universe: the group of symbols for which meta data is loaded, see configs/universe.py
     """
     self.data_config = set_data_config(universe)
     self._universe_meta_data = None
     self.frequency = frequency
     self.symbols = self.data_config.symbols_list
     self.raw_data = DataSet()
Example #15
def main(input_kwargs, network_kwargs, learning_kwargs):
    # ----------------------------------------------------------------------- #
    # DataSet reading
    print("\n" + "#" * 64)

    ds = DataSet(**input_kwargs)
    training_set, gene_set = ds.split(0.8)

    print("**** Data : %s" % input_kwargs)
    ds.display_struct()
    # ds.head()
    print()

    print("**** Training Set :")
    training_set.display_struct()
    print()

    print("**** Generalisation Set :")
    gene_set.display_struct()
    print()

    # ----------------------------------------------------------------------- #
    # Initialization
    # Network
    print("\n" + "#" * 64)

    network = HNN(dim_in=ds.dim_in,
                  dim_out=ds.dim_out,
                  build_params=network_kwargs)

    print("**** Original Network :")
    network.display_params()
    network.pprint()
    print()

    # ----------------------------------------------------------------------- #
    # Learning
    print("\n" + "#" * 64)

    # Training
    try:
        network.fit(training_set.input_set, training_set.output_set,
                    **learning_kwargs)
    except KeyboardInterrupt:
        print()
        print("/!\\ Fitting stopped. /!\\")
        print()

    # Test
    predictions = network.predict(training_set.input_set)
    print("Rights on training set: %.1f %%" %
          (100 * ds.rights_ratio(predictions, training_set.output_set)))
    predictions = network.predict(gene_set.input_set)
    print("Rights on gene set: %.1f %%" %
          (100 * ds.rights_ratio(predictions, gene_set.output_set)))
    print()

    print("**** Trained Network :")
    network.pprint()
Example #16
def main():
    dataset = DataSet()
    #print dataset.articles.keys()[132]
    #print dataset.articles[dataset.articles.keys()[1]]
    print()
    print()
    print()
    print()
    for i in range(10, 20):
        print(dataset.stances[i])
Example #17
def do_reg():
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        #clf = GradientBoostingClassifier(n_estimators=50, random_state=14128, verbose=False)
        # Try random forest
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual, predicted)
Example #18
class Comparis(object):

    def __init__(self, **kwargs):
        self.strategies = DataSet()
        for key, value in kwargs.items():
            self.strategies[key] = value

    def create_results_comparison(self):
        results = pd.DataFrame()
        for name, result in self.strategies.items():
            results = pd.concat([results, result.calculate_results_overview()], axis=1)
        format_dict = {'mean_ann_return': '{:.2%}'}
        # results.style.format(format_dict)
        return results.sort_values(by='sharpe', axis=1, ascending=False)

    def plot_cumulative_returns(self):
        navs = pd.DataFrame()
        for item, strategy in self.strategies.items():
            nav = strategy.net_asset_value.copy()
            nav.name = strategy.name
            navs = pd.concat([navs, nav], axis=1)
        navs.dropna().plot()
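A hypothetical usage sketch; `momentum_bt` and `carry_bt` stand in for strategy result objects that expose name, net_asset_value and calculate_results_overview(), as the class expects:

# Hedged usage sketch of the comparison container.
comparison = Comparis(momentum=momentum_bt, carry=carry_bt)
overview = comparison.create_results_comparison()   # one column per strategy, sorted by Sharpe
print(overview)
comparison.plot_cumulative_returns()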
Example #19
    def save(self, name, suffix=''):
        data = DataSet(name=name, path='fnc-1')
        stances = pd.DataFrame(data=data.stances)
        stances.set_index('Body ID', drop=True, inplace=True)

        articles = pd.DataFrame.from_dict(data.articles, orient='index')
        articles.rename(columns={0: 'body'}, inplace=True)
        articles.index.names = ['Body ID']

        df = pd.merge(stances,
                      articles,
                      how='left',
                      left_index=True,
                      right_index=True)

        df.to_pickle(self.filename.format(name=name))
        return df
Example #20
def load_equities_data_from_disk(universe_symbols, frequency):
    universe = Universe(universe_symbols, frequency)
    universe_data = DataSet()

    for equity in universe_symbols:
        data_root = universe.universe_meta_data.loc[equity]['DataRoot'] + '/'
        file_end = '.parquet.gzip'
        data_df = pd.read_parquet(data_root + equity + file_end)
        # todo: proper dynamic frequency
        data_df = data_df.resample(frequency).last()
        for date in data_df.index:
            if data_df.loc[date].isna().sum() == data_df.shape[1]:
                print("Warning: After resamling, no data available for " +
                      equity + " for date " + str(date))
        universe_data[equity] = data_df

    universe.read_universe_data(universe_data)

    return universe
Example #21
def log_training_data_features(training_size, features):
    TRAINING_SIZE = 50
    dataset = DataSet()
    segments = segmentize_dataset(dataset)
    train_headlines, train_bodies, train_classifications = segments
    classifier = Classifier(train_headlines,
                            train_bodies,
                            train_classifications,
                            size=TRAINING_SIZE,
                            features=['co_occurence', 'n_grams', 'word_overlap']
                            )
    logging_results = []
    # `test_classifications` and `predictions` are assumed to come from an
    # evaluation step (the classifier run on a held-out segment) omitted in this excerpt
    score = Scorer.report_score(test_classifications, predictions)
    logging_results.append(str(score) + "%")
    logging_results.append("==== USING ====")
    features_used = ", ".join(classifier.get_supported_features())
    logging_results.append("Features: {0}".format(features_used))
    logging_results.append("Training Size: {0}".format(TRAINING_SIZE))
    print("\n".join(logging_results))

    # write out to log file
    with open("classifier_performance.log", "a") as LogFile:
        LogFile.write("\n".join(logging_results) + "\n\n")
Example #22
                                   "features/polarity." + name + ".npy")
    X_hand = gen_or_load_feats(hand_features, h, b,
                               "features/hand." + name + ".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X, y


if __name__ == "__main__":
    check_version()
    parse_params()

    datapath = '../../'

    #Load the training dataset and generate folds
    d = DataSet(path=datapath)
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test", path=datapath)
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
Example #23
                                                 lstm_bw_cell,
                                                 x,
                                                 dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']


if __name__ == "__main__":

    if sys.version_info.major < 3:
        sys.stderr.write('Please use Python version 3 and above\n')
        sys.exit(1)

    print('Loading DataSet')
    d = DataSet()
    print('generating folds')
    folds, hold_out = kfold_split(d, n_folds=10)
    #folds=folds[:2]
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    print('Loading word2vec model')
    #model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

    mode_dict = ['word2vec', 'glove50', 'glove100', 'glove200', 'glove300']

    for dic_ind in range(1):
        mode = mode_dict[dic_ind]
        if mode == 'word2vec':
            model = gensim.models.KeyedVectors.load_word2vec_format(
                'GoogleNews-vectors-negative300.bin', binary=True)
Example #24
from utils.dataset import DataSet

crypto_cta_config = DataSet()

crypto_cta_config.position_limit = 1
crypto_cta_config.max_gross_leverage = 1
crypto_cta_config.max_net_leverage = 1
crypto_cta_config.intraday = True
# I might want to add asymmetric limits, i.e. different for longs and shorts
Example #25
from utils.dataset import DataSet
from utils.generate_test_splits import split
from utils.score import report_score

dataset = DataSet()
data_splits = split(dataset)

training_data = data_splits['training']
dev_data = data_splits['dev']
test_data = data_splits['test']

if __name__ == '__main__':
    agree = 0
    disagree = 0
    discuss = 0
    unrelated = 0

    for stance in dev_data:
        if (stance['Stance'] == "agree"):
            agree += 1
        elif (stance['Stance'] == "disagree"):
            disagree += 1
        elif (stance['Stance'] == "discuss"):
            discuss += 1
        else:
            unrelated += 1
    total = len(dev_data)
    print("Total examples: ", total)
    print("Agrees: ", agree / total)
    print("Disagrees: ", disagree / total)
    print("Discusses:", discuss / total)
Example #26
def do_test():
    # TRAIN
    d = DataSet()
    train_stances = d.stances
    X_train, y_train = generate_features(train_stances, d,
                                         "train")  #y_train are ints
    del d

    y_train_stage1 = [STAGE1_Y_MAP[y] for y in y_train]

    stage1_clf = RandomForestClassifier(n_estimators=200,
                                        n_jobs=4,
                                        random_state=14128,
                                        verbose=True)
    stage1_clf.fit(X_train, y_train_stage1)

    del y_train_stage1

    X_train_stage2 = [
        X_train[i] for i in range(len(X_train)) if y_train[i] != 3
    ]
    y_train_stage2 = [y for y in y_train if y != 3]
    #print(y_train_stage2)

    stage2_clf = RandomForestClassifier(n_estimators=200,
                                        n_jobs=4,
                                        random_state=14128,
                                        verbose=True)
    stage2_clf.fit(X_train_stage2, y_train_stage2)

    del X_train
    del y_train
    del X_train_stage2
    del y_train_stage2

    # TEST
    test_d = TestDataSet()
    test_stances = test_d.stances
    X_test = generate_test_features(test_stances, test_d,
                                    "test")  # TODO add embedding features

    stage1_predictions = [int(a) for a in stage1_clf.predict(X_test)]
    #print(stage1_predictions)
    related_ids = [
        i for i in range(len(stage1_predictions)) if stage1_predictions[i] == 0
    ]

    X_test_stage2 = [X_test[i] for i in related_ids]
    stage2_predictions = [int(a) for a in stage2_clf.predict(X_test_stage2)]
    #print(stage2_predictions)

    final_predictions = []
    for i in range(len(stage1_predictions)):
        if i in related_ids:
            prediction = stage2_predictions[related_ids.index(i)]
            final_predictions.append(prediction)
        else:
            final_predictions.append(3)
    #print(final_predictions)

    write_submission(test_d, final_predictions, 'submission.csv')
Example #27
def _train(backbone_name, path_to_data_dir, path_to_checkpoints_dir):
    dataset = DataSet(path_to_data_dir, mode=DataSet.Mode.TRAIN)
    dataloader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=True,
                            num_workers=8,
                            pin_memory=True)

    backbone = BaseModel.select_model(backbone_name)(pretrained=True)
    model = Model(backbone).to(DEVICE)
    optimizer = optim.SGD(model.parameters(),
                          lr=1e-3,
                          momentum=0.9,
                          weight_decay=0.0005)
    scheduler = StepLR(optimizer, step_size=50000, gamma=0.1)

    step = 0
    time_checkpoint = time.time()
    losses = deque(maxlen=100)
    should_stop = False

    num_steps_to_display = 20
    num_steps_to_snapshot = 10000
    num_steps_to_stop_training = 70000

    print('Start training')

    while not should_stop:
        for batch_index, (_, image_batch, _, bboxes_batch,
                          labels_batch) in enumerate(dataloader):
            assert image_batch.shape[
                0] == 1, 'only batch size of 1 is supported'

            image = image_batch[0].to(DEVICE)
            bboxes = bboxes_batch[0].to(DEVICE)
            labels = labels_batch[0].to(DEVICE)

            forward_input = Model.ForwardInput.Train(image,
                                                     gt_classes=labels,
                                                     gt_bboxes=bboxes)
            forward_output = model.train().forward(forward_input)

            loss = forward_output.anchor_objectness_loss + forward_output.anchor_transformer_loss + \
                forward_output.proposal_class_loss + forward_output.proposal_transformer_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            losses.append(loss.item())
            step += 1

            if step % num_steps_to_display == 0:
                elapsed_time = time.time() - time_checkpoint
                time_checkpoint = time.time()
                steps_per_sec = num_steps_to_display / elapsed_time
                avg_loss = sum(losses) / len(losses)
                lr = scheduler.get_lr()[0]
                print(
                    '[Step {}] Avg. Loss = {}, Learning Rate = {} ({} steps/sec)'
                    .format(step, avg_loss, lr, steps_per_sec))

            if step % num_steps_to_snapshot == 0:
                path_to_checkpoint = model.save(path_to_checkpoints_dir, step)
                print('Model saved to {}'.format(path_to_checkpoint))

            if step == num_steps_to_stop_training:
                should_stop = True
                break

    print('Done')
Example #28
        # random.shuffle(data_tags)
    else:
        data_tags = sim_args.data_sets
    for data_tag in data_tags:
        assert data_tag in DATASET_COLLECTION, 'Command line input is currently not supported.'
        yield DATASET_COLLECTION[data_tag]


DATASET_COLLECTION = {}
DATASET_COLLECTION['NP2003'] = DataSet(
    '2003_np',
    '/zfs/ilps-plex1/slurm/datastore/hooster2/datasets/2003_np_dataset/Fold*/',
    'bin',
    True,
    59,
    multileave_feat=[
        range(11, 16),  #TF-IDF
        range(21, 26),  #BM25
        range(26, 41),  #LMIR
        [41, 42],  #SiteMap
        [49, 50]  #HITS
    ])
DATASET_COLLECTION['NP2004'] = DataSet(
    '2004_np',
    '/zfs/ilps-plex1/slurm/datastore/hooster2/datasets/2004_np_dataset/Fold*/',
    'bin',
    True,
    64,
    multileave_feat=[
        range(11, 16),  #TF-IDF
        range(21, 26),  #BM25
Example #29
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap, X_overlap_quotes,
              X_overlap_pos, X_overlap_pos_sentence, X_tfidf, X_tfidf_max,
              X_overlap_bpe_SS]
    return X, y


if __name__ == "__main__":
    check_version()

    print('Running Conditioned CNN on FNC1 Dataset')
    dl_model_pred, _unused1, _unused2 = get_predictions_from_FNC_1_Test(
        params.dl_weights_file, params.apply_pos_filter, DEVICE)

    #Load the training dataset and generate folds
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    stances = pd.DataFrame(competition_dataset.stances)
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
Example #30
        gen_or_load_feats(bow_averaged_vectors, h, b,
                          "features/bowvec_200dnorm." + name + ".npy"))
    X_bowc = np.array(
        gen_or_load_feats(bow_count_vectors, h, b,
                          "features/bowcount_1000." + name + ".npy"))
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap, X_bowv, X_bowc]
    print("... Done. Features :", X.shape[1])
    return X, y


if __name__ == "__main__":
    check_version()
    parse_params()

    #Load the training dataset and generate folds
    d = DataSet(path='../data/train')
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test",
                                  path='../data/competition_test')
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    print("competition test", X_competition.shape)
    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")