import time
import types
import copyreg
from multiprocessing import Pool, cpu_count

#Project-local modules and constants (module paths assumed):
from DBManager import DBManager
from GeneralHelpers import GeneralHelpers
from PlotManager import PlotManager
from ImportManager import ImportManager
from FeatureManager import FeatureManager
from ExperimentManager import ExperimentManager
from constants import (MODEL_NAME, ARFF_FILE_EXTENSION, PROJECT_ROOT_DIRECTORY,
                       DATASET_ARFF_DIR_NAME, N_EXPERIMENTS)

class Main:
    """
    Main class, makes necessary function calls to necessary classes
    """

    def __init__(self):
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__plot_manager = PlotManager()
        self.__import_manager = ImportManager()
        self.__feature_manager = FeatureManager()

        self.years = ("2012", "2013", "2014", "2015")

    def retrieve_tweets(self, file_path_of_ids):
        """
        Runs Import Manager to retrieve and import tweets
        :param file_path_of_ids: string, file path of the tweet IDs to import
        :return: void
        """
        self.__import_manager.run(file_path_of_ids)

    def extract_features_and_generate_arff(self, n=3, analyzer='char', year='2012'):
        """
        Makes necessary function calls to extract features for given year and to generate arff file
        :param n: int, ngram count
        :param analyzer: string, word or char
        :param year: string, 2012, 2013, 2014, 2015 or ALL
        :return: string, path of generated arff file
        """

        # Getting tweets with year
        print("Getting tweets for year "+ year)
        tweets_for_given_year = self.__db_manager.get_tweets_for_year(year)

        print("Generating document and classes of tweets.")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_given_year, True)

        print("Fitting the data, finding ngrams and frequencies.")
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n, analyzer)
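        # With analyzer='char' and n=3 this yields character trigrams as the
        # feature names (ngrams) and a sparse document-term matrix (X); the
        # exact return shapes are assumed from how they are used below.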

        print("Formatting the data for arff lib format.")
        formatted_arff_data = self.__feature_manager.format_data_for_arff(ngrams, arff_data)

        print("Generating file.")
        # Experiment name, 1grams, 2grams, 3grams.. or words
        experiment_name = str(n) + 'Gram' if analyzer == 'char' else 'Word'

        # File name, TTNet_3grams_2012
        file_name = MODEL_NAME + '_' + experiment_name + '_' + year

        # File name randomized TTNet_3grams_2012_asfas12.arff
        file_name = self.__helper.generate_random_file_name(file_name, ARFF_FILE_EXTENSION)

        # Arff file path ...../DataSet-ARFF/3Gram/TTNet/TTNet_3grams_2012_asfas12.arff
        arff_file_path = PROJECT_ROOT_DIRECTORY + DATASET_ARFF_DIR_NAME + experiment_name + '/' + MODEL_NAME + '/'

        # Generating the file with data
        self.__helper.generate_arff_file(arff_file_path, file_name, formatted_arff_data)

        print("Arff file generated at path:"+arff_file_path+file_name)

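    # Usage sketch (assumes a populated database and the path constants above):
    #   main = Main()
    #   arff_path = main.extract_features_and_generate_arff(n=3,
    #                                                       analyzer='char',
    #                                                       year='2012')
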
    def run_experiment_with_scikit_learn(self, n=1, analyzer='word'):
        """
        Makes necessary method calls to run the experiment on scikit learn.
        :param n: int, count n in n-gram
        :param analyzer: string, either 'word' or 'char'
        :return: void
        """
        # Retrieving all tweets from database
        print("Retrieving all tweets from database.")
        tweets_for_all_years = {}
        # Iterating over all years
        for year in self.years:
            # Retrieving tweets for the year
            tweets_for_year = self.__db_manager.get_tweets_for_year(year)
            tweets_for_all_years[year] = tweets_for_year

        # Creating a big list of tweets
        print("Creating a big list of tweets.")
        all_tweets = []
        # Appending all tweets together
        for year, tweets in tweets_for_all_years.items():
            all_tweets += tweets

        # Generating document
        print("Generating document and classes by preprocessing")
        # Preprocessing and generation of document
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(all_tweets, True)

        # Getting years' tweets counts
        print("Getting years' tweets counts.")
        years_tweets_counts = {}
        for year in self.years:
            years_tweets_counts[year] = len(tweets_for_all_years[year])

        all_processes = []
        self.all_experiments_results = []

        pool = Pool(max(cpu_count() - 1, 1))
        # Register a custom pickler for bound methods so apply_async can ship
        # experiment_manager.run_experiment to the worker processes
        copyreg.pickle(types.MethodType, self._reduce_method)

        print("Running experiments.")
        t0 = time.time()
        for i in range(0, N_EXPERIMENTS):
            print("Experiment:"+str(i))
            experiment_manager = ExperimentManager(i, years_tweets_counts, n, analyzer)
            r = pool.apply_async(experiment_manager.run_experiment, args=(document, classes,), callback=self._accumulate_experiments_scores)
            all_processes.append(r)

        for a_process in all_processes:
            a_process.wait()

        t1 = time.time()

        print("Elapsed time:", t1- t0, " seconds")

        pool.close()
        pool.join()

        print("Cumulating all the experiments' scores.")
        final_results_from_all_experiments = self.__helper.cumulate_years_scores(self.all_experiments_results)
        return final_results_from_all_experiments

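    # Usage sketch (wiring assumed from the method signatures above):
    #   main = Main()
    #   scores = main.run_experiment_with_scikit_learn(n=1, analyzer='word')
    #   main.plot_all_experiment_results_with_scikit_learn(scores)
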
    def _reduce_method(self, m):
        """
        Tells pickle how to reduce a bound method so it can be sent to the
        worker pool
        :param m: bound method
        :return: tuple, (callable, args) reduction for pickle
        """
        return getattr, (m.__self__, m.__func__.__name__)

    def _accumulate_experiments_scores(self, an_experiments_result):
        """
        Accumulates each experiment's scores as its worker finishes
        :param an_experiments_result: dict, a single experiment's scores
        :return: void
        """
        an_experiments_result = self.__helper.calculate_relative_scores(an_experiments_result)
        self.all_experiments_results.append(an_experiments_result)

    def plot_experiment_results(self, root_dir):
        """
        Plots experiment's results from log files
        :param root_dir: string
        :return: void
        """
        lines_scores = self.__helper.get_accuracy_scores_for_experiment_years_from_root_dir(root_dir)
        self.__plot_manager.plot_experiments_results(lines_scores)

    def plot_all_experiment_results_with_scikit_learn(self, all_line_scores_of_all_experiments):
        """
        Plots all line scores of all experiments
        :param all_line_scores_of_all_experiments: dict
        :return: void
        """
        self.__plot_manager.plot_experiments_results_with_scikit_learn(all_line_scores_of_all_experiments)

    def plot_years_scores(self, root_dir):
        """
        Makes necessary function calls to plot years scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_years_scores_from_root_directory(root_dir)

    def plot_2012_vs_rest(self, root_dir):
        """
        Makes necessary function calls to plot 2012 vs REST scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_2012_vs_rest(root_dir)

    def plot_top_feature_frequencies_in_years(self):
        """
        Makes necessary function calls to plot the top features' frequencies across years
        :return: void
        """
        years_features_counts = {}

        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)

        self.__plot_manager.plot_top_feature_frequencies_in_years(years_features_counts)

    def find_frequency_dictionary_for_year(self, year):
        """
        Finds frequencies of each feature for given year
        :param year: string
        :return: dict
        """
        # For this particular method, find_roots=True, n=1, analyzer=word because we're working with top info gain words

        tweets_for_the_year = self.__db_manager.get_tweets_for_year(year)
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_the_year, find_roots=True)
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n=1, analyzer='word')

        terms = vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
        freqs = X.sum(axis=0).A1

        result = sorted(zip(freqs, terms), reverse=True)

        freqs = [elm[0] for elm in result]
        terms = [elm[1] for elm in result]

        final_result = dict(zip(terms, freqs))

        return final_result

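    # e.g. find_frequency_dictionary_for_year('2012') -> {'internet': 152, ...}
    # (keys are word unigrams, values corpus frequencies; numbers illustrative)
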
    def plot_years_intersection_scores(self):
        """
        Makes necessary function calls to plot a matrix which shows years' vocabulary similarities
        :return: void
        """
        years_features_counts = {}

        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
            
        self.__plot_manager.plot_years_intersection_scores(years_features_counts)

    def import_new_tweets_from_csv(self, root_path):
        """
        Runs Import Manager to import new tweets from CSV files
        :param root_path: string, root directory containing the CSV files
        :return: void
        """
        self.__import_manager.import_new_tweets_from_csv(root_path)
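
    #NOTE: the method below comes from a different example (a Classifier class
    #with X_test, y_test and logger attributes; it also needs
    #"from sklearn.metrics import accuracy_score"); it is not part of Main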
    def evaluate(self, clf):
        """
        Evaluates a fitted classifier on the held-out test split
        :param clf: fitted scikit-learn estimator with a predict() method
        :return: void
        """
        # Predict the response for test dataset
        y_pred = clf.predict(self.X_test)
        self.logger.info("-> Predicted: {}".format(y_pred))
        self.logger.info("-> Correct: {}".format(self.y_test))
        self.logger.info("-> Accuracy: {}".format(
            accuracy_score(self.y_test, y_pred)))


import os
from itertools import chain, combinations
from sklearn.metrics import accuracy_score

#Project-local modules for this example (module paths assumed):
from Classifier import Classifier
from ClassifierLogger import ClassifierLogger
from FeatureManager import FeatureManager

if __name__ == "__main__":
    logger = ClassifierLogger().get_logger()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    corpus = os.path.join(dir_path, "resources", "News_category_train.json")

    features = ['authors', 'headline', 'short_description']
    # Build every subset of the feature list; bind the result to a new name so
    # the itertools.combinations function is not shadowed
    feature_combinations = chain(
        *map(lambda x: combinations(features, x), range(0,
                                                        len(features) + 1)))

    for combination in feature_combinations:
        if len(combination) == 0:
            continue
        feat_manager = FeatureManager(corpus, combination, logger)
        # classifierNB = Classifier(feat_manager, ["Naive Bayes", "Decision Tree", "Adaboost", "Support Vector Machine", "Random Forest", "Gradient Descent"])
        classifierNB = Classifier(
            feat_manager, ["Support Vector Machine", "Gradient Descent"],
            logger)
        # self.logger.info("TRAINING WITHOUT CLEANSING")
        # classifierNB.train(cleanse=False)
        classifierNB.train(cleanse=True)
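
#Separate example: EpaDB feature extraction and evaluation symlink setup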
import glob
import os
from FeatureManager import FeatureManager

features_path = 'epadb/test/data'
conf_path = 'conf'
epadb_root_path = 'EpaDB'
text_path = 'epadb/test/text'

feature_manager = FeatureManager(epadb_root_path, features_path, conf_path)

feature_manager.extract_features_using_kaldi(text_path)

#Create symbolic links to labels used in evaluation stage
for file in sorted(glob.glob('EpaDB/*/labels/*')):
    fullpath = os.path.abspath(file)
    basename = os.path.basename(file)
    #Get spkr id
    spkr = fullpath.split('/')[-3]
    labels_dir_for_spkr = 'evaluate/epadb_30/' + spkr + '/labels/'
    #Create directory for speaker's labels
    if not os.path.exists(labels_dir_for_spkr):
        os.system('mkdir -p ' + labels_dir_for_spkr)
    #Make symbolic link to speaker labels from EpaDB directory
    if not os.path.exists(labels_dir_for_spkr + '/' + basename):
        os.system('ln -s ' + fullpath + ' ' + labels_dir_for_spkr + '/')
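
#Portable stdlib alternative to the mkdir/ln shell calls above (same effect):
#    os.makedirs(labels_dir_for_spkr, exist_ok=True)
#    link_path = os.path.join(labels_dir_for_spkr, basename)
#    if not os.path.lexists(link_path):
#        os.symlink(fullpath, link_path)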

#Handle symbolic links for reference transcriptions used in evaluation stage
if not os.path.exists('evaluate/epadb_30/reference_transcriptions.txt'):
    current_path = os.getcwd()
    #Make symbolic link to the reference transcriptions (target directory
    #assumed to mirror the labels layout above)
    os.system('ln -s ' + current_path + '/EpaDB/reference_transcriptions.txt ' +
              'evaluate/epadb_30/')

#Example 5: forced alignment / GOP scoring with PyKaldi (fragment)
#Imports assumed for this fragment (FTDNN and FeatureManager are project-local):
import torch
from FTDNN import FTDNN
from FeatureManager import FeatureManager
from kaldi.alignment import MappedAligner
from kaldi.fstext import SymbolTable
from kaldi.lat.align import WordBoundaryInfo, WordBoundaryInfoNewOpts
from kaldi.matrix import Matrix
from kaldi.util.table import DoubleMatrixWriter

#Paths such as phones, acoustic_model_path, epadb_root_path, data_path,
#conf_path, loglikes_wspec and sample_list_path come from the truncated head
#of the script. The aligner construction below is reconstructed: the class
#and its first two arguments are assumed from typical PyKaldi usage.
aligner = MappedAligner.from_files(transition_model_path,
                                   tree_path,
                                   lang_graph,
                                   symbols_path,
                                   disam,
                                   acoustic_scale=1.0)
phones = SymbolTable.read_text(phones)
wb_info = WordBoundaryInfo.from_file(
    WordBoundaryInfoNewOpts(),
    "data/lang_test_tgsmall/phones/word_boundary.int")

# Instantiate the PyTorch acoustic model (subclass of torch.nn.Module)
model = FTDNN()
model.load_state_dict(torch.load(acoustic_model_path))
model.eval()
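# eval() puts dropout / batch-norm layers in inference mode so the
# log-likelihoods computed below are deterministic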

#Create feature manager
feature_manager = FeatureManager(epadb_root_path, data_path, conf_path)

align_out_file = open("gop/align_output", "w+")
# Decode and write output lattices
with DoubleMatrixWriter(loglikes_wspec) as loglikes_writer:
    for line in open(sample_list_path, 'r').readlines():
        logid = line.split()[0]
        #tkey, text = line.strip().split(None, 1)
        feats, text = feature_manager.get_features_for_logid(logid)
        text = text.upper()
        feats = torch.unsqueeze(feats, 0)
        loglikes = model(feats)  # Compute log-likelihoods
        loglikes = Matrix(
            loglikes.detach().numpy()[0])  # Convert to PyKaldi matrix
        loglikes_writer[logid] = loglikes
        out = aligner.align(loglikes, text)