예제 #1
0
 def __init__(self, data_file):
     """Wire up the helper objects used to evaluate ``data_file``.

     Args:
         data_file: Path kept on the instance as ``self.data_file`` and
             handed to the Processor as its ``training_file`` setting.
     """
     # Result containers start out empty (three distinct lists).
     self.segments, self.aspects, self.sentiments = [], [], []

     self.utilities = Utilities()
     self.data_file = data_file

     # The segmenter is loaded through the processor, so build that first.
     processor_settings = {'training_file': data_file}
     self.processor = Processor(processor_settings)
     self.segmenter = self.processor.load_segmenter()
예제 #2
0
 def __init__(self, params=None, debug=0):
     """Store the run parameters and create the helper objects.

     Args:
         params: Mapping of parameters for this instance. Defaults to a
             fresh empty dict per instance; the previous ``{}`` default was
             a single dict shared by every instance created without
             arguments, because it is stored on ``self.parameters``.
         debug: Debug level kept on ``self.debug``. It used to be accepted
             but ignored (``self.debug`` was always set to 0).
     """
     if params is None:
         params = {}
     self.parameters = params
     self.debug = debug
     self.num_params = len(params)
     self.info = {}
     self.results = {}
     self.logger = Logging()
     self.processor = Processor()
예제 #3
0
def my_api():
    """Process the current request's query arguments and return JSON text.

    The query arguments are copied into a plain dict, handed to
    ``Processor(...).process()``, and the result is serialised with
    ``json.dumps``. The wall-clock time spent in processing is printed
    to stdout in seconds.
    """
    query_args = dict(request.args)

    started_at = time.time()
    result = Processor(query_args).process()
    elapsed = time.time() - started_at
    print(elapsed)

    return json.dumps(result)
def process_hurricanes(fn, prefix, Basins, year_pairs, year_start_TC):
    """Build a table of before/after float pairs for every hurricane.

    Args:
        fn: Path of the hurricane track CSV; it must contain an
            'Unnamed: 0' column (dropped here) plus 'SEASON', 'ID', 'NAME'
            and 'NUM'.
        prefix: String prepended to 'Argo_data_aggr_{sy}_{ey}.mat' to
            locate the aggregated Argo file for each year pair.
        Basins: Not used anywhere in this function body.
        year_pairs: Iterable of (start_year, end_year) pairs; both bounds
            are inclusive when filtering on 'SEASON'.
        year_start_TC: Lower 'SEASON' bound for the initial hurricane list
            (see the note on that list below).

    Returns:
        A DataFrame of the concatenated per-hurricane pair frames, each
        tagged with 'HurricaneID', de-duplicated on 'after_t', with the
        extra columns 'profile_dt', 'hurricane_dt' and 'signed_angle'.

    NOTE(review): `case2use` is not a parameter or local; it must exist as
    a module-level name when this function runs.
    NOTE(review): if no pair frame is produced, `df_lst` stays empty and
    `pd.concat` raises ValueError.
    """

    # Read hurricane track data and drop the stray 'Unnamed: 0' column.
    dataframe = pd.read_csv(fn)
    del dataframe['Unnamed: 0']

    # Sorted unique IDs of hurricanes whose season is >= year_start_TC.
    # NOTE(review): `hurricanes` and `n` are recomputed inside the loop
    # below before they are read, so these values are never used.
    hurricanes = list(set(np.array(dataframe[
            dataframe['SEASON'] >= year_start_TC
        ]['ID'])))
    hurricanes.sort()
    n = len(hurricanes)

    df_lst = []
    for sy, ey in year_pairs:
        # Open the aggregated Argo file for this year range: via h5py when
        # case2use == 'noML', otherwise via scipy.io.loadmat.
        # NOTE(review): the h5py.File handle is never closed in this function.
        if case2use == 'noML':
            f = h5py.File(prefix + f'Argo_data_aggr_{sy}_{ey}.mat', 'r')
        else:
            f = scipy.io.loadmat(prefix + f'Argo_data_aggr_{sy}_{ey}.mat')
        # Sorted unique IDs of hurricanes with sy <= SEASON <= ey.
        hurricanes = list(set(np.array(dataframe[
                (dataframe['SEASON'] >= sy)
                &
                (dataframe['SEASON'] <= ey)
                ]['ID'])))
        hurricanes.sort()
        n = len(hurricanes)  # only used for the progress message below

        print(f'Processing years {sy} - {ey}...')

        # For each hurricane, let Processor collect the "before" floats,
        # add the "after" floats and build the pair frame.
        for idx, h_id in enumerate(hurricanes):
            hurricane_df = dataframe[dataframe['ID'] == h_id]
            # Name/season are taken from the hurricane's first track row.
            name = np.array(hurricane_df['NAME'])[0]
            season = np.array(hurricane_df['SEASON'])[0]
            num = np.array(hurricane_df['NUM'])[0]  # not used afterwards
            print(f'Processing {idx+1} of {n}: {name} of {season} ({h_id}).')
            P = Processor(hurricane_df, f)
            P.generate_before_floats()
            # Skip hurricanes for which no "before" float was found.
            if P.float_df.shape[0] == 0:
                print('No before floats')
                continue
            P.add_after_floats()
            pair_df = P.create_pair_df()
            
            # create_pair_df may return None; keep only real frames.
            if pair_df is not None:
                df_lst.append(pair_df.assign(HurricaneID=h_id))

    # Sort by 'before_t' descending so that drop_duplicates (which keeps the
    # first occurrence by default) retains, for each 'after_t', the row with
    # the largest 'before_t'.
    df = pd.concat(df_lst
            ).sort_values('before_t', ascending=False
            ).drop_duplicates('after_t').reset_index(drop=True)
    # Differences between the after time and the before / projected times.
    df['profile_dt'] = df['after_t'] - df['before_t']
    df['hurricane_dt'] = df['after_t'] - df['proj_t']
    # signed_angle is the negated product of the 'sign' and 'angle' columns.
    df = df.assign(signed_angle=lambda r:
            - r.sign * r.angle)
    return df
예제 #5
0
def update_data(entity, level, skiplvl, times, runtimes):
    """Apply up to ``times`` successive versions to each document of ``entity``.

    Documents are fetched with ``get_doc_version(entity, level,
    time=runtimes)``. For every document a ``Processor`` is created and
    ``p.next()`` is applied ``times`` times, each result being stored via
    ``update_doc``. The function then calls itself with ``runtimes + 1``.

    Args:
        entity: Entity name passed through to the helpers.
        level: Current version level of the documents to fetch.
        skiplvl: Passed unchanged to ``Processor``.
        times: Number of versions to apply; 0 makes the call return
            ``None`` immediately after the fetch.
        runtimes: Value forwarded as ``time=`` to ``get_doc_version``.

    Returns:
        ``None`` once a call is reached with ``times == 0``.

    NOTE(review): apart from ``times == 0`` there is no visible stop
    condition for the trailing recursion; confirm how the caller expects
    this to terminate.
    """
    docs = get_doc_version(entity, level, time=runtimes)

    # Compare by value: ``is 0`` tests identity and only works by accident
    # of CPython's small-int caching (and is a SyntaxWarning on 3.8+).
    if docs.count() == 0 and times != 0:
        # The original call omitted ``skiplvl`` and passed four arguments to
        # this five-parameter function, which raised TypeError.
        update_data(entity, level + 1, skiplvl, times - 1, 0)

    if times == 0:
        return None

    for doc in docs:
        p = Processor(entity=entity, data=doc, level=level, skiplvl=skiplvl)

        # NOTE(review): unreachable because of the ``times == 0`` guard
        # above; kept in case "0 means all versions" was the intent.
        if times == 0:
            times = p.doc_last_version

        for _ in range(level, level + times):
            mod = p.next()
            update_doc(entity, doc["_id"], mod)
            click.echo('End: <{}> {}'.format(entity, doc['_id']))

    return update_data(entity, level, skiplvl, times, runtimes + 1)
    def run_experiment(self, dataset_initial):
        """Run one Processor per random state for the given dataset.

        For every entry of ``self.random_states`` the train, test and output
        CSV paths are derived from ``self.storage_path``, ``dataset_initial``
        and the random state, and a Processor configured with them is run.

        Args:
            dataset_initial: Dataset prefix used in the generated file names.
        """
        for seed in self.random_states:
            base_path = self.storage_path + dataset_initial
            suffix = str(seed) + '.csv'

            processor = Processor(settings={
                'training_file': base_path + '_train_' + suffix,
                'data_file': base_path + '_test_' + suffix,
                # Options: 0 to any integer | default: None (all)
                'max_reviews': None,
                'output_file': base_path + '_output_' + suffix,
            })
            processor.run()
예제 #7
0
def main():
    """Generate the word tables if needed, build the database and copy it.

    Steps:
      1. If any of the three ``table_*.txt`` files is missing from the
         directory containing this script, run the Processor for grade
         levels 1 through 8 (this takes around 3 minutes).
      2. Create and populate the database tables through ``Creator``.
      3. Copy ``init.db`` from the script directory to ``words.db`` in the
         parent directory.
    """
    script_dir = os.path.split(os.path.abspath(__file__))[0]
    parent_dir = os.path.split(script_dir)[0]
    # TODO: add additional sources

    # Only run the (slow) processor when a generated table is missing.
    present = os.listdir(script_dir)
    required_tables = (
        "table_StartWords.txt",
        "table_SynWords.txt",
        "table_AntWords.txt",
    )
    if not all(table in present for table in required_tables):
        for grade_level in range(1, 9):
            p = Processor(grade_level, script_dir)
            print("Processing grade level " + str(grade_level))
            p.start()
            p.end()

    # Load the generated tables into the database.
    db_create = Creator(script_dir)
    print("Creating tables...")
    db_create.create_tables()
    print("Populating tables...")
    db_create.populate()
    db_create.end()

    # Copy the database file to the main directory of the project.
    print("Moving database to main project directory.")
    copyfile(script_dir + "/init.db", parent_dir + "/words.db")

    print("\nSetup is complete.\n")
예제 #8
0
# Working directory at import time, and the 'videos' directory beneath it
# where videos are saved.
ROOT_DIR = os.getcwd()
SAVE_DIR = f'{os.getcwd()}/videos'

# Create the Flask server and wrap it in an Api object.
app = Flask(__name__)
api = Api(app)
# Header names handed to CORS as `allow_headers`.
# NOTE(review): '*' is listed next to two concrete header names; confirm
# that this is the intended CORS configuration.
valid_headers = ['Content-Type', 'Access-Control-Allow-Origin', '*']
cors = CORS(app, allow_headers=valid_headers)

# Connect to the database.
mongo = MongoDatabase()

# Video processing.
processor = Processor()

# File extensions intended for server-side upload checking.
ALLOWED_EXTENSIONS = set(['webm', 'mp4', 'mp3', 'wav', 'jpeg', 'gif', 'png'])

# -------------------------------------------------------------
# Global functions.


def allowed_file(filename):
    """Report whether the uploaded file ``filename`` should be kept.

    Every filename is currently accepted: the argument is not inspected
    and no extension check is performed.
    """
    return True


def validate_filepath(func):
    '''Decorator to validate a filepath from the frontend.'''
예제 #9
0
class Evaluator:
    """Evaluate the segmenter and the aspect/sentiment classifiers of a Processor.

    The gold segments, aspects and sentiments are read from ``data_file``
    when the instance is created.
    """

    def __init__(self, data_file):
        """Create the helper objects and load the gold data.

        Args:
            data_file: Path given to the Processor as ``training_file`` and
                to ``Utilities.get_segments_aspects_sentiments``.
        """
        self.utilities = Utilities()
        self.data_file = data_file
        self.processor = Processor({'training_file': data_file})
        self.segmenter = self.processor.load_segmenter()
        self.stanford = Stanford()
        self.segments = []
        self.aspects = []
        self.sentiments = []
        self.prepare_aspect_sentiment_data()

    def calculate_evaluatio_matrices(self, labels, result):
        """Summarise predictions against binary gold labels.

        Args:
            labels: Gold labels; entries equal to 1 / 0 are counted as
                positives / negatives, any other value is ignored.
            result: Predicted labels aligned with ``labels``.

        Returns:
            Dict with the integer counts ``positives`` and ``negatives`` and
            the strings ``accuracy`` and ``f1_score`` (three decimals).
        """
        # Imported locally so the fix does not depend on the module imports.
        from sklearn.metrics import f1_score

        positives = 0
        negatives = 0
        for label in labels:
            if label == 1:
                positives += 1
            elif label == 0:
                negatives += 1

        return {
            'positives': positives,
            'negatives': negatives,
            'accuracy': "%.3f" % accuracy_score(labels, result),
            # Was computed with recall_score, so the value reported under
            # 'f1_score' was actually the recall.
            'f1_score': "%.3f" % f1_score(labels, result),
        }

    def evaluate_segmentation(self):
        """Cross-validate a linear SVM on the segmenter's features and print F1.

        Prints the five per-fold micro-F1 scores (rounded to 3 decimals)
        followed by their mean.
        """
        dataset = self.segmenter.features_and_labels
        all_data_transformed = self.segmenter.transform_categorical_numerical(
            dataset['data'], 'train')
        all_data_unique = self.utilities.get_unique_list_of_lists(
            all_data_transformed, dataset['labels'])

        model = svm.SVC(kernel='linear')

        X = all_data_unique['data']
        y = all_data_unique['labels']

        f1_scores = cross_val_score(model, X, y, scoring='f1_micro', cv=5)
        # Was a Python 2 print statement, which is a SyntaxError on Python 3.
        print([round(score, 3) for score in f1_scores.tolist()])
        print("F1-score: %0.4f" % (f1_scores.mean()))

    def prepare_aspect_sentiment_data(self):
        """Load segments, aspects and sentiments from ``self.data_file``."""
        data = self.utilities.get_segments_aspects_sentiments(self.data_file)
        self.segments = data['segments']
        self.aspects = data['aspects']
        self.sentiments = data['sentiments']

    def transform_aspect_name_list(self, name_list):
        """Map aspect names to ids via ``self.lexicon.get_aspect_id_by_name``.

        NOTE(review): ``self.lexicon`` is not assigned in ``__init__``; it
        must be set by the caller before this method is used.
        """
        return [self.lexicon.get_aspect_id_by_name(name) for name in name_list]

    def evaluate_classifier(self, classifier, X, y):
        """Fit ``classifier`` on a 70/30 split and report its test performance.

        Misclassified test samples are written to ``aspect_errors.csv`` as
        quoted ``sample,expected,predicted`` rows, then the result of
        ``clsr(y_test, y_pred)`` is printed.

        Args:
            classifier: Estimator exposing ``fit``; the fitted model must
                expose ``predict``.
            X: Samples.
            y: Gold labels aligned with ``X``.
        """
        X_train, X_test, y_train, y_test = tts(
            X, y, test_size=0.3, random_state=5)
        model = classifier.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Save the misclassified samples for error analysis.
        errors = [
            '"{}","{}","{}"'.format(sample, expected, predicted)
            for sample, expected, predicted in zip(X_test, y_test, y_pred)
            if expected != predicted
        ]
        self.utilities.write_content_to_file(
            'aspect_errors.csv', "\n".join(errors))

        print(clsr(y_test, y_pred))

    def evaluate_aspect_extraction(self):
        """Evaluate the processor's aspect classifier on the loaded gold data."""
        self.evaluate_classifier(
            self.processor.ml_asp_classifier, self.segments, self.aspects)

    def transform_sentiment_classes(self, sentiment_names):
        """Return each name's index in ``self.utilities.sentiment_classes``.

        Raises:
            ValueError: If a name is not in ``sentiment_classes``.
        """
        classes = self.utilities.sentiment_classes
        return [classes.index(name) for name in sentiment_names]

    def evaluate_sentiment_detection(self):
        """Evaluate the processor's sentiment classifier on the loaded gold data.

        Note that ``evaluate_classifier`` writes its error rows to
        ``aspect_errors.csv`` for this evaluation as well.
        """
        self.evaluate_classifier(
            self.processor.ml_snt_classifier, self.segments, self.sentiments)
예제 #10
0
class Evaluator:
    """Cross-validated evaluation of a Processor's segmenter and classifiers.

    Gold segments, aspects and sentiments are parsed on demand from the CSV
    at ``data_file`` (see ``get_segments_gold_data``).
    """

    def __init__(self, data_file):
        """Create the helper objects.

        Args:
            data_file: Path of the gold CSV; also given to the Processor as
                its ``training_file`` setting.
        """
        self.utilities = Utilities()
        self.data_file = data_file
        self.processor = Processor({'training_file': data_file})
        self.segmenter = self.processor.load_segmenter()
        self.segments = []
        self.aspects = []
        self.sentiments = []

    def calculate_evaluatio_matrices(self, labels, result):
        """Summarise predictions against binary gold labels.

        Args:
            labels: Gold labels; entries equal to 1 / 0 are counted as
                positives / negatives, any other value is ignored.
            result: Predicted labels aligned with ``labels``.

        Returns:
            Dict with the integer counts ``positives`` and ``negatives`` and
            the strings ``accuracy`` and ``f1_score`` (three decimals).
        """
        # Imported locally so the fix does not depend on the module imports.
        from sklearn.metrics import f1_score

        positives = 0
        negatives = 0
        for label in labels:
            if label == 1:
                positives += 1
            elif label == 0:
                negatives += 1

        return {
            'positives': positives,
            'negatives': negatives,
            'accuracy': "%.3f" % accuracy_score(labels, result),
            # Was computed with recall_score, so the value reported under
            # 'f1_score' was actually the recall.
            'f1_score': "%.3f" % f1_score(labels, result),
        }

    def evaluate_segmentation(self):
        """Cross-validate a linear SVM on the segmenter's features and print F1.

        Prints the five per-fold micro-F1 scores (rounded to 3 decimals)
        followed by their mean.
        """
        dataset = self.segmenter.features_and_labels
        all_data_transformed = self.segmenter.transform_categorical_numerical(
            dataset['data'], 'train')
        all_data_unique = self.utilities.get_unique_list_of_lists(
            all_data_transformed, dataset['labels'])

        model = svm.SVC(kernel='linear')

        X = all_data_unique['data']
        y = all_data_unique['labels']

        f1_scores = cross_val_score(model, X, y, scoring='f1_micro', cv=5)
        # Was a Python 2 print statement, which is a SyntaxError on Python 3.
        print([round(score, 3) for score in f1_scores.tolist()])
        print("F1-score: %0.4f" % (f1_scores.mean()))

    def get_segments_gold_data(self):
        """Parse the gold CSV into parallel segment/aspect/sentiment lists.

        Column 0 of each row holds the comment, with its segments separated
        by ``**$**``; column ``i + 1`` holds the label of segment ``i`` in
        the form ``"<aspect> <sentiment>"``. An empty label is read as
        ``"other neutral"`` and ``"noise"`` as ``"noise neutral"``.

        Returns:
            Dict with the keys ``segments``, ``aspects`` and ``sentiments``,
            each a list with one entry per segment.

        Raises:
            IndexError: If a row has fewer label columns than segments, or
                a non-empty label contains no space.
        """
        rows = self.utilities.read_from_csv(self.data_file)

        segments = []
        aspects = []
        sentiments = []
        for row in rows:
            comment_parts = row[0].split('**$**')
            for index, comment_part in enumerate(comment_parts):
                segments.append(self.utilities.clean_up_text(comment_part))

                aspect = row[index + 1]
                if len(aspect) < 1:
                    aspect = 'other neutral'
                elif aspect == 'noise':
                    aspect = 'noise neutral'

                # The last word is the sentiment, everything before it the
                # aspect; split once instead of twice.
                label_parts = aspect.rsplit(' ', 1)
                aspects.append(label_parts[0])
                sentiments.append(label_parts[1])

        return {
            'segments': segments,
            'aspects': aspects,
            'sentiments': sentiments,
        }

    def evaluate_classifier(self, classifier, X, y, scoring='f1_micro'):
        """Print the mean score of ``classifier`` over five shuffled splits.

        Each split holds out 20% of the data; the split generator is seeded
        with ``random_state=11``.

        Args:
            classifier: Estimator passed to ``cross_val_score``.
            X: Samples.
            y: Gold labels aligned with ``X``.
            scoring: Scoring name passed to ``cross_val_score``.
        """
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=11)
        scores = cross_val_score(classifier, X, y, cv=cv, scoring=scoring)

        print(sum(scores) / float(len(scores)))

    def evaluate_aspect_extraction(self, X, y, merged=True):
        """Evaluate the aspect classifier, optionally on merged classes.

        Args:
            X: Segments.
            y: Gold aspect labels aligned with ``X``.
            merged: Only when exactly ``True``, ``y`` is first passed
                through the aspect classifier's ``merge_classes``.
        """
        if merged is True:
            y = self.processor.ml_asp_classifier.merge_classes(y)

        self.evaluate_classifier(self.processor.ml_asp_classifier, X, y)

    def transform_sentiment_classes(self, sentiment_names):
        """Return each name's index in ``self.utilities.sentiment_classes``.

        Raises:
            ValueError: If a name is not in ``sentiment_classes``.
        """
        classes = self.utilities.sentiment_classes
        return [classes.index(name) for name in sentiment_names]

    def evaluate_sentiment_detection(self, scoring='f1_micro', merged=True):
        """Evaluate the sentiment classifier on the gold data.

        Prints the number of gold segments before the score.

        Args:
            scoring: Scoring name forwarded to ``evaluate_classifier``.
            merged: When truthy, the gold sentiments are first passed
                through the sentiment classifier's ``merge_classes``.
        """
        data = self.get_segments_gold_data()
        X = data['segments']
        print(len(X))
        y = data['sentiments']

        if merged:
            y = self.processor.ml_snt_classifier.merge_classes(y)

        self.evaluate_classifier(self.processor.ml_snt_classifier,
                                 X,
                                 y,
                                 scoring=scoring)

    def get_category_counts(self, cat_type='aspect', merged=True):
        """Count how often each gold aspect or sentiment class occurs.

        Args:
            cat_type: ``'aspect'`` or ``'sentiment'``.
            merged: Only when exactly ``True``, the classes are merged
                before counting.

        Returns:
            A ``Counter`` of class labels, or the string
            ``"Incorrect category type."`` for an unknown ``cat_type``.

        NOTE(review): aspects are merged with ``self.utilities.merge_classes``
        here, whereas ``evaluate_aspect_extraction`` uses the aspect
        classifier's ``merge_classes``; confirm the two agree.
        """
        data = self.get_segments_gold_data()

        if cat_type == 'aspect':
            categories = data['aspects']
        elif cat_type == 'sentiment':
            categories = data['sentiments']
        else:
            return "Incorrect category type."

        if merged is True and cat_type == 'aspect':
            categories = self.utilities.merge_classes(categories)
        elif merged is True and cat_type == 'sentiment':
            categories = self.processor.ml_snt_classifier.merge_classes(
                categories)

        return Counter(categories)
예제 #11
0
import NXOpen

# Set up logging: one log file per calendar day and login name, written to
# config.LOG_DIR as "<ISO date>_<user>.log".
user = os.getlogin()
timestamp = date.today().isoformat()

log_file = os.path.join(config.LOG_DIR, "{}_{}.log".format(timestamp, user))

logging.basicConfig(filename=log_file,
                    format='[%(asctime)s]%(levelname)s|%(name)s:%(message)s',
                    level=config.LOGGING_LEVEL)
logger = logging.getLogger(__name__)

# Obtain the NX session and create the processor.
session = NXOpen.Session.GetSession()
processor = Processor()

# Log the NX version; fall back to the older variable name when the first
# lookup returns an empty value.
nx_version = session.GetEnvironmentVariableValue("NX_FULL_VERSION")
if not nx_version:  # NX 12 and prior
    nx_version = session.GetEnvironmentVariableValue("UGII_FULL_VERSION")
logger.info("NX Version: {}".format(nx_version))

# Declare the caller options: three boolean flags, plus --mfg which accepts
# zero or more values.
parser = argparse.ArgumentParser()
parser.add_argument("--select", action="store_true")
parser.add_argument("--work", action="store_true")
parser.add_argument("--all_open", action="store_true")
parser.add_argument("--mfg", action="store", nargs="*")

# parse arguments
예제 #12
0
                        dest='SVML_DRIVE',
                        default=False,
                        action='store_true')
    parser.add_argument('-d',
                        dest='CSV_FILE',
                        default='~/store/fraud_data/creditcard.csv')
    parser.add_argument('-yclass', dest='YCOL', default='Class')
    args = parser.parse_args()
    #arguments for running ml suite

    #driver - controller.py
    #CSV_FILE = '~/store/fraud_data/creditcard.csv'
    #YCOL = 'Class'
    logger = Logging()
    m = Model()
    proc = Processor()

    #processor
    data = proc.load_csv(args.CSV_FILE)
    data = proc.normalize_col(data, 'Amount')
    data = data.drop(['Time'], axis=1)
    print data[args.YCOL].value_counts()
    X = proc.get_xvals(data, args.YCOL)
    y = proc.get_yvals(data, args.YCOL)

    #processor xfolds
    Xu, yu = proc.under_sample(data, args.YCOL)
    Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
        Xu, yu, .3, 0)
    X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)
예제 #13
0
def main(_):
    """Train a single-layer linear model on the credit-card data and print scores.

    The CSV is loaded and prepared through ``Processor``; a linear model
    over 29 input features is trained for 20 full-batch gradient-descent
    steps on the sum of squared errors, and after every step the
    ``accuracy`` op is evaluated on the first 10000 test rows and printed.

    Args:
        _: Unused positional argument.

    NOTE(review): the ``accuracy`` op is the mean of
    ``cast(correct_prediction) - y_test`` exactly as in the original, which
    is not a conventional accuracy; confirm the intended metric.
    """
    # Import data
    CSV_FILE = '~/store/fraud_data/creditcard.csv'
    YCOL = 'Class'
    logger = Logging()
    proc = Processor()

    # TODO make this test suite
    data = proc.load_csv(CSV_FILE)
    data = proc.normalize_col(data, 'Amount')
    data = data.drop(['Time'], axis=1)
    X = proc.get_xvals(data, YCOL)
    y = proc.get_yvals(data, YCOL)
    # The under-sampled split is computed as before but not used below.
    Xu, yu = proc.under_sample(data, YCOL)
    Xu_train, Xu_test, yu_train, yu_test = proc.cross_validation_sets(
        Xu, yu, .3, 0)
    X_train, X_test, y_train, y_test = proc.cross_validation_sets(X, y, .3, 0)

    # Linear model: logits = x @ W + b. Distinct names keep the label data
    # ``y`` from being shadowed by the output tensor.
    x = tf.placeholder(tf.float32, [None, 29])
    W = tf.Variable(tf.zeros([29, 1]))
    b = tf.Variable(tf.zeros([1]))
    logits = tf.matmul(x, W) + b
    labels = tf.placeholder(tf.float32, [None, 1])

    # Loss and optimizer: sum of squared errors, plain gradient descent.
    loss = tf.reduce_sum(tf.square(tf.subtract(labels, logits)))
    train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # ``as_matrix()`` was removed in pandas 1.0; ``.values`` is the
    # equivalent that exists in old and new releases alike.
    y_train_values = y_train.values
    eval_labels = y_test.values[:10000]
    eval_features = X_test.head(10000)

    # Build the evaluation ops once. They used to be recreated on every loop
    # iteration, adding new nodes to the graph each time.
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(
        tf.subtract(tf.cast(correct_prediction, tf.float32), eval_labels))

    sess = tf.InteractiveSession()
    try:
        tf.global_variables_initializer().run()
        # Train, evaluating after every step.
        for _step in range(20):
            sess.run(train_step,
                     feed_dict={x: X_train, labels: y_train_values})
            print("[model] training is complete ***************** ")
            print('accuracy: %s' % sess.run(accuracy,
                                            feed_dict={
                                                x: eval_features,
                                                labels: eval_labels
                                            }))
    finally:
        # Release the session even if training or evaluation raises.
        sess.close()