Example #1
def make_plots(tweets, box, place):
    """Make plots from the extracted tweet data.

    Args:
        tweets: Iterable of tweets already filtered and within the
            bounding box.
        box: A pair of longitude and latitude pairs, with the southwest
            corner of the bounding box coming first.
        place: String for the place name of the bounding `box`.
    """
    print('Extracting tweets ...', end=' ')
    data = list(process.extract_data(tweets))
    print('DONE')

    print('Extracting census blocks ...', end=' ')
    census_path = config.get('census', 'path')
    census_blocks = config.get('census', 'blocks')
    blocks = process.extract_blocks(os.path.join(census_path, census_blocks))
    print('DONE')

    print('Processing data ...', end=' ')
    longitude, latitude, time, users = process.process_data(data)
    print('DONE')

    print('Computing census block interactions ...', end=' ')
    interactions = process.compute_block_interactions(users, blocks)
    print('DONE')

    print('Saving census block interactions ...', end=' ')
    with open('census-block-interactions.tsv', mode='w') as f:
        process.dump_interactions(interactions, f)
    print('DONE')

    print('Making figures ...', end=' ')
    figures = []
    figures.append(make_map(longitude, latitude, time, box, place))
    figures.append(make_user_map(longitude, latitude, users, box, place))
    figures.append(make_user_checkins(users))
    figures.append(make_heatmap(longitude, latitude, box, place))
    figures.append(make_time(time))
    print('DONE')

    print('Saving figures ...', end=' ')
    for figure in figures:
        figure.savefig('{0}.png'.format(figure.get_label()),
                       bbox_inches='tight',
                       pad_inches=0.1)
    print('DONE')

    plt.show()
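
A hypothetical call sketch for make_plots; the tweet iterable, bounding-box coordinates, and place name below are made-up values chosen only to illustrate the argument shapes described in the docstring.

# Hypothetical usage; filtered_tweets, the coordinates, and the place name are illustrative only.
sf_box = ((-122.52, 37.70), (-122.35, 37.83))   # (southwest, northeast) lon/lat pairs
make_plots(filtered_tweets, box=sf_box, place='San Francisco')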
Example #2
    def __getitem__(self, item):
        data = process_data(self.tweet[item], self.selected_text[item],
                            self.sentiment[item], self.tokenizer, self.max_len)

        return {
            'ids':
            torch.tensor(data["ids"], dtype=torch.long),
            'mask':
            torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids':
            torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start':
            torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end':
            torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet':
            data["orig_tweet"],
            'orig_selected':
            data["orig_selected"],
            'sentiment':
            data["sentiment"],
            'offsets':
            torch.tensor(data["offsets"], dtype=torch.long)
        }
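
A minimal sketch of how a Dataset with this __getitem__ is typically batched; the TweetDataset name, its constructor arguments, and the dataframe columns are assumptions, not part of the excerpt.

# Hypothetical usage; TweetDataset and the df columns are assumed names.
import torch

dataset = TweetDataset(tweet=df["text"].values,
                       selected_text=df["selected_text"].values,
                       sentiment=df["sentiment"].values)
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

for batch in loader:
    ids = batch["ids"]        # (batch_size, max_len) token ids
    mask = batch["mask"]      # attention mask over padding
    targets_start = batch["targets_start"]   # span start labels
    targets_end = batch["targets_end"]       # span end labels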
Example #3
def main():
    env = sys.argv[1]
    db_details = DB_DETAILS[env]
    source_db = db_details['SOURCE_DB']
    target_db = db_details['TARGET_DB']
    logger.add("data-copier.info",
               rotation="1 MB",
               retention="10 days",
               level="INFO"
               )
    logger.add("data-copier.err",
               rotation="1 MB",
               retention="10 days",
               level="ERROR"
               )
    # Establishing connection to MySQL DB
    mysql_conn = get_connection(source_db)
    # Reading data from base tables through DataFrames
    logger.info('Reading data')
    df_dim_products, df_dim_customers, df_fact_product_revenue_dly, \
        df_fact_revenue_dly = process_data(mysql_conn)
    # Loading data into facts and dim tables in Postgres
    logger.info('Loading data')
    load_data(df_dim_products, df_dim_customers, df_fact_product_revenue_dly, df_fact_revenue_dly, target_db)
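
One possible shape for the DB_DETAILS lookup that this main() indexes with sys.argv[1]; the excerpt only shows the 'SOURCE_DB' and 'TARGET_DB' keys, so the nested connection fields below are illustrative assumptions about what get_connection might read.

# Illustrative only: the nested keys are assumptions; the excerpt shows just
# DB_DETAILS[env]['SOURCE_DB'] and DB_DETAILS[env]['TARGET_DB'].
DB_DETAILS = {
    'dev': {
        'SOURCE_DB': {'DB_TYPE': 'mysql',
                      'DB_HOST': 'localhost',
                      'DB_NAME': 'retail_db',
                      'DB_USER': 'retail_user',
                      'DB_PASS': 'changeme'},
        'TARGET_DB': {'DB_TYPE': 'postgres',
                      'DB_HOST': 'localhost',
                      'DB_NAME': 'retail_dw',
                      'DB_USER': 'retail_user',
                      'DB_PASS': 'changeme'},
    }
}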
Example #4
pgpassword = ''
pgdatabase = 'popnet'

# DIFFERENT PATHS ------------------------------------------------------------------------------------------------------
# Get path to main script
python_script_dir = os.path.dirname(os.path.abspath(__file__))

# Paths for the data / folders in the Project_data folder --------------------------------------------------------------
#path to ancillary data folder
ancillary_data_folder_path = os.path.join(python_script_dir, "data",
                                          "ancillary")
#path to GADM folder
gadm_folder_path = os.path.join(python_script_dir, "data", "GADM")
#path to GHS folder
ghs_folder_path = os.path.join(python_script_dir, "data", "GHS")

# Paths to storage during the data preparation (AUTOMATICALLY CREATED) -------------------------------------------------
#path to temp folder - will contain temporary files
temp_folder_path = os.path.join(python_script_dir, "temp")
#Files to be merged folder
merge_folder_path = os.path.join(python_script_dir, "Tif_to_merge")
#path to data folder to store the final tif files
finished_data_path = os.path.join(python_script_dir, "Finished_data")

# Process all data -----------------------------------------------------------------------------------------------------
process_data(country, pgpath, pghost, pgport, pguser, pgpassword, pgdatabase,
             ancillary_data_folder_path, gadm_folder_path, ghs_folder_path,
             temp_folder_path, merge_folder_path, finished_data_path,
             init_prep, init_import_to_postgres, init_run_queries,
             init_export_data, init_rasterize_data, init_merge_data)
Example #5
from input import import_data
from process import process_data


# Read the data
data = import_data('data.txt')
data = process_data(data)
Example #6
from process import process_data
import config
import joblib
import torch
from sklearn import model_selection
from model import EntityModel
from dataset import EntityDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from engine import train_fn, eval_fn
import numpy as np

if __name__ == '__main__':

    sentences, pos, tag, enc_pos, enc_tag = process_data(config.TRAINING_FILE)

    meta_data = {"enc_pos": enc_pos, "enc_tag": enc_tag}

    joblib.dump(meta_data, "meta.bin")

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (train_sentences, test_sentences, train_pos, test_pos, train_tag,
     test_tag) = model_selection.train_test_split(sentences,
                                                  pos,
                                                  tag,
                                                  random_state=42,
                                                  test_size=0.1)

    train_dataset = EntityDataset(texts=train_sentences,
                                  pos=train_pos,
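
The excerpt is cut off mid-call. Below is a hedged sketch of how this kind of setup commonly continues, reusing only names already imported above; the batch sizes, epoch count, learning rate, and the exact EntityDataset, EntityModel, train_fn, and eval_fn signatures are assumptions.

# Hedged continuation sketch; hyperparameters and signatures are assumptions,
# not taken from the excerpt.
train_dataset = EntityDataset(texts=train_sentences, pos=train_pos, tags=train_tag)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32)

valid_dataset = EntityDataset(texts=test_sentences, pos=test_pos, tags=test_tag)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EntityModel(num_tag=num_tag, num_pos=num_pos).to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)
total_steps = len(train_loader) * 10
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

for epoch in range(10):
    train_fn(train_loader, model, optimizer, device, scheduler)
    eval_fn(valid_loader, model, device)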
Example #7
    data = np.array(data)
    train_size = int(data.shape[0] * percent)
    train = data[0:train_size, :]
    test = data[train_size:, :]

    x_train = train[:, 0: -1]
    y_train = train[:, -1]

    x_test = test[:, 0: -1]
    y_test = test[:, -1]

    return x_train, y_train, x_test, y_test

#Begin to train
if __name__ == "__main__":
    process.process_data()

    x_train, y_train, x_test, y_test = get_data()

    adaBoost = ensemble.AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=1)
    adaBoost.fit(x_train, y_train)
    ytest_ = adaBoost.predict(x_test)
    print(classification_report(y_test, ytest_))

    adaBoost = ensemble.AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=2)
    adaBoost.fit(x_train, y_train)
    ytest_ = adaBoost.predict(x_test)
    print(classification_report(y_test, ytest_))

    adaBoost = ensemble.AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=5)
    adaBoost.fit(x_train, y_train)
Example #8
    parser.add_argument("fine_tune")
    parser.add_argument("--hidden_size", default=512, type=int)
    parser.add_argument("--device", default="cuda")
    parser.add_argument("--batch_size", default=32, type=int)

    args = parser.parse_args()

    lang_model_names = ["scibert-base-cased", "xlnet-base-cased"]
    fine_tunes = [False]
            
    lang_model, tokenizer, lm_emb_size = parse_lang_model(args.lang_model_name)
    vocab_size = len(tokenizer)

    device = torch.device(args.device)

    train_loader = process_data(args.train_data, tokenizer, device, args.lang_model_name, batch_size=args.batch_size)
    dev_loader = process_data(args.dev_data, tokenizer, device, args.lang_model_name, is_dev=True, batch_size=args.batch_size)

    model = LangModelWithDense(lang_model,
                               lm_emb_size,
                               args.hidden_size,
                               num_classes,
                               args.fine_tune).to(device)

    print(model)
    print("Using model: {}".format(args.lang_model_name))
    print("Using device: {}".format(device))
    print("Using fine-tuning: {}".format(args.fine_tune))
    print()

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
Example #9
def main():
    env = sys.argv[1]
    db_details = DB_DETAILS[env]
    source_db = db_details['SOURCE_DB']
    target_db = db_details['TARGET_DB']
    # Read data from retail_db
    mysql_conn = get_connection(source_db)
    # Process data using pandas
    dim_products_df, dim_customers_df, fact_product_revenue_dly_df, df_fact_revenue_dly = process_data(mysql_conn)
    # Write the data to retail_dw
    load_data(dim_products_df, dim_customers_df, fact_product_revenue_dly_df, df_fact_revenue_dly, target_db)
Example #10
import numpy as np
from process import process_data
from fontANN import feedforward
import sys

X = np.array([float(sys.argv[1]), float(sys.argv[2]), float(sys.argv[3])])
X = process_data(X)
npzfile = np.load('matrix.npz')
W2 = npzfile['W2']
B2 = npzfile['B2']
W1 = npzfile['W1']
B1 = npzfile['B1']
Y, Z = feedforward(X, W1, W2, B1, B2)
print(int(np.rint(Y[0])))
# return 1
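
For context, a hypothetical sketch of a two-layer feedforward pass consistent with the W1/B1/W2/B2 weights loaded above; the real fontANN.feedforward may use different shapes or activations.

# Hypothetical sketch only; not the actual fontANN implementation.
def feedforward(X, W1, W2, B1, B2):
    Z = 1.0 / (1.0 + np.exp(-(X.dot(W1) + B1)))   # hidden activations (sigmoid)
    Y = 1.0 / (1.0 + np.exp(-(Z.dot(W2) + B2)))   # output activations (sigmoid)
    return Y, Z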
Example #11
    args = parser.parse_args()

    lang_model, tokenizer, lm_emb_size = parse_lang_model(args.lang_model)
    vocab_size = len(tokenizer)

    device = torch.device(args.device)

    print("Using model: {}".format(args.lang_model))
    print("Using device: {}".format(device))
    print("Using fine-tuning: {}".format(args.fine_tune))
    print()

    train_loader = process_data(args.train_data,
                                tokenizer,
                                device,
                                train_data=True,
                                fine_tune=args.fine_tune,
                                batch_size=args.batch_size)
    dev_loader = process_data(args.dev_data,
                              tokenizer,
                              device,
                              fine_tune=args.fine_tune,
                              batch_size=args.batch_size)

    model = LangModelWithDense(lang_model, vocab_size, lm_emb_size,
                               args.hidden_size, args.fine_tune).to(device)

    print(model)

    epochs = 10
    total_steps = len(train_loader) * epochs
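
The excerpt ends right after computing total_steps; below is a hedged sketch of how it is typically consumed by a warmup scheduler. The optimizer settings and the loop body are assumptions, not part of the excerpt.

# Hedged sketch; optimizer hyperparameters and the loop body are assumptions.
from transformers import get_linear_schedule_with_warmup

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

for epoch in range(epochs):
    # train on train_loader, stepping optimizer and scheduler, then evaluate on dev_loader
    pass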