Example #1
def weight_extract(model, optimizer, criterion, train_loader, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if torch.cuda.is_available():
            data, target = Variable(data.cuda()), Variable(target.cuda())
        else:
            data, target = Variable(data), Variable(target)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)

        utils.c = target.view(-1, 1)  # batch array torch.tensor[128]
        utils.c = utils.c.type(torch.cuda.FloatTensor)
        utils.weight_extract_densenet(model.module)

        for i in utils.c:
            for j in i:
                utils.str_w = utils.str_w + str(j.tolist()) + ','
            utils.str_w += '\n'

        utils.save_to_csv()
        utils.str_w = ''

        if batch_idx % 100 == 0:
            print('Epoch: {}'.format(epoch))
Example #2
def weight_extract_test(model, criterion, test_loader):
    utils.csv_file_name = 'weight_test.csv'
    model.eval()
    for batch_idx, (data, target) in enumerate(test_loader):
        if torch.cuda.is_available():
            data, target = Variable(data.cuda()), Variable(target.cuda())
        else:
            data, target = Variable(data), Variable(target)

        output = model(data)
        loss = criterion(output, target)

        utils.c = target.view(-1, 1)  # batch array torch.tensor[128]
        utils.c = utils.c.type(torch.cuda.FloatTensor)
        utils.weight_extract(model.module)

        for i in utils.c:
            for j in i:
                utils.str_w = utils.str_w + str(j.tolist()) + ','
            utils.str_w += '\n'

        utils.save_to_csv()
        utils.str_w = ''

    print('weight test extract')
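
The `utils` module used in Examples #1 and #2 is not shown; the calls imply module-level globals `csv_file_name`, `str_w` and `c`, plus a no-argument `save_to_csv()` that flushes the accumulated string to that file. A minimal sketch under those assumptions (names inferred from the calls above, not taken from the original project):

# Hypothetical sketch of the utils module implied above; the real one may differ.
csv_file_name = 'weight_train.csv'  # weight_extract_test switches this to 'weight_test.csv'
str_w = ''                          # CSV rows accumulated by the callers
c = None                            # current batch of targets, set by the callers


def save_to_csv():
    """Append the accumulated row buffer to csv_file_name."""
    with open(csv_file_name, 'a') as f:
        f.write(str_w)
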
Example #3
def collecting_handler(message):
    """Collect orders"""

    # open sheet
    sh = gc.open_by_key(SPREADSHEET_ID)
    if message.chat.type == "group" or message.chat.type == "private":
        print(message)
        # read each sheet
        wks_quay_dau = sh.worksheet(ROTATION_NAME)

        # read data from group and write to sheets
        if message.chat.title == "KIỂM KHO QUẬN 7" or message.chat.title == "test":
            file_uri = f"./assets/csv/kiem_kho_{datetime.fromtimestamp(message.date).strftime('%Y-%m-%d')}.csv"
            # save text to csv file
            save_to_csv(file_uri, message)
            bot.send_message(message.chat.id, "Đã lưu vào file csv.")
        elif message.chat.title == "Đơn Quay Đầu - Bắn Kiểm Thiếu":
            # save to Quaydau_bot sheet
            save_to_sheet(wks_quay_dau, message)
            bot.send_message(
                message.chat.id,
                "Mã đơn hàng đã được lưu vào google sheets. Các anh chị có tên vui lòng hoàn tất đơn quay đầu trong ngày. Thanks.",
            )
        else:
            bot.send_message(message.chat.id, "Nothing to do...")
Example #4
def main():

    transactions = pd.read_csv(INPUT_PATH,
                               sep="\t",
                               names=['user_id', 'item_id', 'rating', 'time'],
                               engine='python')
    # print(transactions.head())
    print(transactions.head())
    # convert to implicit scenario
    #transactions['rating'] = 1
    print(transactions.head())
    # make the dataset
    train_df, test_df = get_train_test_df(transactions)
    save_to_csv(train_df,
                OUTPUT_PATH_TRAIN,
                header=False,
                index=False,
                verbose=1)
    save_to_csv(test_df,
                OUTPUT_PATH_TEST,
                header=False,
                index=False,
                verbose=1)
    report_stats(transactions, train_df, test_df)
    return 0
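
Here `save_to_csv` looks like a thin wrapper around `pandas.DataFrame.to_csv`; a possible sketch consistent with the keyword arguments used above (the `verbose` flag is assumed to only control logging):

# Sketch of the assumed helper, not the project's actual implementation.
import pandas as pd


def save_to_csv(df, path, header=True, index=True, verbose=0):
    """Write a DataFrame to CSV, optionally reporting how many rows were written."""
    df.to_csv(path, header=header, index=index)
    if verbose:
        print('wrote {} rows to {}'.format(len(df), path))
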
Example #5
    def calculate_cdf(self, df, node):
        node_difference = node+'_difference'
        stats_df = df.groupby([node_difference])[node_difference].agg('count').pipe(pd.DataFrame).rename(columns={node_difference: 'frequency'})
        stats_df['pdf'] = stats_df['frequency'] / sum(stats_df['frequency'])
        stats_df['cdf'] = stats_df['pdf'].cumsum()
        stats_df = stats_df.reset_index()

        if not os.path.exists(self.path):
            os.mkdir(self.path)

        utils.save_to_csv(stats_df, self.path, 'latency_' + node)
        return stats_df
Example #6
def deploy_zone_prediction():
    """
    generate java model for zone prediction
    :return: training result and java model
    """
    dir = "log/peps normal"
    pattern = r'(left|right|front|back|start|trunk|lock)\\\d{1,2}.csv$'
    pattern_valid = r'(3|6|9|12).csv$'
    utils.construct_set(dir, pattern, pattern_valid, filter=1)
    utils.save_to_csv()
    id = 'EightNormal'
    dir_path = 'model/'
    rf = utils.train_rf(model_id=id, ntrees=25, weight_lock=1)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    rf.download_pojo(path=dir_path, get_genmodel_jar=False)
Example #7
def _create_set(tweets, set_name, index_name, nb_authors, previous_authors,
                author_mode, nb_tweets_per_author, nb_pos_pairs_per_tweet,
                nb_neg_pairs_per_tweet, previous_tweets, tweet_mode, texts):

    print("pairing tweets...")
    authors, tweet_ids, pairs = _pair_tweets(tweets, nb_authors,
                                             previous_authors,
                                             author_mode, nb_tweets_per_author,
                                             nb_pos_pairs_per_tweet,
                                             nb_neg_pairs_per_tweet,
                                             previous_tweets, tweet_mode)

    print("converting format...")
    dataset = _convert_format(pairs, texts)
    print("saving...")
    save_to_csv(dataset, "data/datasets/"+set_name+".csv", index_name)

    return authors, tweet_ids
Example #8
def main():
    """Main function that executes all of the data alterations and checks
    against the input data.
    First, the function iterates through all of the files in the data directory,
    and creates a dictionary object containing the file name in file_name and
    file contents in file.
    Afterwards, we check the encoding of the file
    with utils.process_encoding, and attempt to force convert the file to
    UTF-8 if it is not in that format already.
    Then, the delimiter of the file is taken from utils.get_file_delim.
    Next, utils.replace_newline is used to normalize line breaks to '\n'.
    Finally, utils.row_by_row_check is run to parse each row and fix any
    common data issues.

    Args:
        There are no arguments as the program parses all the files in the
        data directory.

    Returns:
        None. For each file in the data directory, a copy of its cleaned data
        is saved to the processed_data/cleaned_files directory, and a copy of
        its dirty data is saved to the processed_data/dirty_files directory.
    """
    for filename in os.listdir('data'):
        with open('data/' + filename) as raw_data:

            data_file = utils.get_file_metaddata(raw_data, filename)

            utf_encoding, data_file['file'] = utils.process_encoding(
                data_file['file'])
            if not utf_encoding:
                utils.save_to_csv(data_file, data_file['file'], 'bad_files')
                continue

            has_delim, delimiter = utils.get_file_delim(data_file['file'])
            if not has_delim:
                utils.save_to_csv(data_file, data_file['file'], 'bad_files')
                continue

            data_file['file'] = utils.replace_newline(data_file['file'])
            header = data_file['file'].pop(0)
            header_length = len(header.split(','))

            cleaned_data, dirty_data = utils.row_by_row_check(
                data_file['file'], delimiter, header_length)
            if cleaned_data:
                utils.save_to_csv(data_file, header + '\n' + cleaned_data,
                                  'cleaned_files')
            if dirty_data:
                utils.save_to_csv(data_file, header + '\n' + dirty_data,
                                  'dirty_files')
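
The `utils` helpers walked through in the docstring are not defined here. As an illustration, `get_file_delim` could be built on `csv.Sniffer`; the `(has_delim, delimiter)` return shape is inferred from the call site above:

# Hypothetical sketch of one helper; the project's real utils module may differ.
import csv


def get_file_delim(lines):
    """Guess the delimiter from a sample of the file's lines."""
    sample = '\n'.join(lines[:10]) if isinstance(lines, list) else str(lines)[:1024]
    try:
        dialect = csv.Sniffer().sniff(sample, delimiters=',;\t|')
        return True, dialect.delimiter
    except csv.Error:
        return False, None
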
Example #9
    def saveTrace(self, checked):
        #diag = QFileDialog.getSaveFileName(self, "Select destination", "./", "Comma Separated Values (*.csv)");
        diag = QFileDialog(self)
        diag.setAcceptMode(QFileDialog.AcceptSave)  #Save file, not open one
        diag.setNameFilter(
            "Comma Separated Values (*.csv);;Space separated Values (*.csv)")
        diag.setDefaultSuffix("csv")
        # Make sure selected files end in .csv
        diag.exec()
        try:
            filename = diag.selectedFiles()[0]
        except IndexError:
            filename = ''
        user_filter = diag.selectedNameFilter()
        if (user_filter == "Space separated Values (*.csv)"):
            delimiter = " "
        else:
            delimiter = ","

        if (filename != '' and not os.path.isdir(filename)):

            npzfile = self.last_result
            t = npzfile["t"]
            cos2 = npzfile["cos2"]
            cos2d = npzfile["cos2d"]
            extra_header = []
            extra_columns = []
            if ('Javg' in npzfile.keys()):
                Javg = npzfile["Javg"]
                std = npzfile["std"]
                psi_pulse = npzfile["psi"]
                psi_final = psi_pulse[-1]
                psi_out = numpy.abs(psi_final)**2
                percentile_999 = npzfile["percentile_999"]
                extra_header = [
                    "<J>", "std(J)", "J_99.9%", "Probability coefficients"
                ]
                extra_columns = [Javg, std, percentile_999, psi_out]

            utils.save_to_csv(filename, t, cos2, cos2d, extra_header,
                              extra_columns, delimiter)
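
`utils.save_to_csv` is called here (and in several of the later alignment-trace examples) with the time axis, the cos^2 traces and optional extra columns; a minimal sketch of a writer with that shape, assuming every column is a 1-D array of the same length:

# Sketch of the assumed writer; the real utils.save_to_csv may differ (e.g. for ragged columns).
import numpy


def save_to_csv(filename, t, cos2, cos2d, extra_header=(), extra_columns=(), delimiter=","):
    """Write t, cos2, cos2d and any extra columns side by side with a header row."""
    header = delimiter.join(["t", "cos2", "cos2d"] + list(extra_header))
    data = numpy.column_stack([t, cos2, cos2d] + list(extra_columns))
    numpy.savetxt(filename, data, delimiter=delimiter, header=header, comments="")
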
Example #10
 def scanimg(self, image_data):
     """
     Submit the form to the OCR API.
     Requires a base64-encoded image.
     """
     try:
         data = {"accuracy": self.quality, "image": image_data}
         response = requests.post(self.api_url, data=data, headers=self.header)
         if response.status_code != 200:
             print(time.ctime()[:-5], "Failed to get info")
             return None
         else:
             
             result = response.json()["words_result"]
             print("- ",result)
             invoice_data = {
                 '检索日期': '-'.join(time.ctime().split()[1:3]),
                 "发票类型": result["InvoiceType"],
                 '发票代码': result['InvoiceCode'],
                 '发票号码': result['InvoiceNum'],
                 '开票日期': result['InvoiceDate'],
                 '合计金额': result['TotalAmount'],
                 '税率': result['CommodityTaxRate'][0]['word'],
                 '合计税额': result['TotalTax'],
                 '价税合计': result['AmountInFiguers'],
                 '销售方名称': result['SellerName'],
                 '销售方税号': result['SellerRegisterNum'],
                 '购方名称': result['PurchaserName'],
                 '购方税号': result['PurchaserRegisterNum'],
                 '备注': result['Remarks'],
                 
             }
             save_to_csv(invoice_data, "test.csv")
             return invoice_data
     except:
         message = "发票识别API调用出现错误"  # "an error occurred while calling the invoice OCR API"
         # Pushover.push_message(message)
         return None
     finally:
         print(time.ctime()[:-5], "产生一次了调用")  # "one API call was made"
Example #11
 def save_artist_data(self, artist_data):
     print('Save artist data:', artist_data.artist_information)
     utils.save_to_csv(artist_data.artist_information.name,
                       artist_data.events, 'events')
     utils.save_to_csv(artist_data.artist_information.name,
                       artist_data.setlists, 'setlists')
     utils.save_to_csv(artist_data.artist_information.name,
                       artist_data.recordings, 'recordings')
Example #12
    def egalitarian_score(self, save_plot=True, label=None):
        TIME_LIMIT = 5
        msgs_df = self.all_messages.copy()

        # determine if node was in sync
        for node in self.all_nodes:
            msgs_df[node + '_sync'] = (msgs_df[node].subtract(msgs_df['time']) < TIME_LIMIT).astype(int)

        # calculate how many nodes were in sync
        nodes_sync = [x+'_sync' for x in self.all_nodes]
        msgs_df['totals'] = msgs_df[nodes_sync].sum(axis=1)

        # calculate normalized value of how many nodes were out of sync
        msgs_df['egalitarian_score'] = (len(self.all_nodes) - msgs_df['totals']) / len(self.all_nodes)

        if self.plot_all:
            utils.save_to_csv(msgs_df, self.path, 'egalitarian_score')

        df_plot = msgs_df[msgs_df['time'] < (self.simulation_time - TIME_LIMIT)]
        plt.plot(df_plot['time'], df_plot['egalitarian_score'], label=label, linewidth=0.9, markevery=3.5)

        # calculate the mean egalitarian score; leave out the last interval because messages not yet delivered before the simulation ends can distort the results
        mean = df_plot['egalitarian_score'].mean()
        print('Egalitarian score: %.2f' % mean)

        if save_plot:
            # set y-axis from 0 to max
            axes = plt.gca()
            # axes.set_ylim([0, 0.2])

            plt.title('Egalitarian Score=%.2f' % mean)
            plt.ylabel('nodes')
            plt.xlabel('time (s)')
            plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
            plt.subplots_adjust(right=0.81)
            self.save_to_file('egalitarian_score', 'all')

        return mean
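
As a quick sanity check of the score defined above (a toy example, not simulation data): with four nodes and one of them receiving a message later than TIME_LIMIT, that message's row scores (4 - 3) / 4 = 0.25.

# Toy illustration of the per-message egalitarian score computed above.
import pandas as pd

TIME_LIMIT = 5
nodes = ['n1', 'n2', 'n3', 'n4']
df = pd.DataFrame({'time': [0.0], 'n1': [1.0], 'n2': [2.0], 'n3': [3.0], 'n4': [9.0]})
for node in nodes:
    df[node + '_sync'] = (df[node].subtract(df['time']) < TIME_LIMIT).astype(int)
df['egalitarian_score'] = (len(nodes) - df[[n + '_sync' for n in nodes]].sum(axis=1)) / len(nodes)
print(df['egalitarian_score'].iloc[0])  # 0.25 -- one of four nodes was out of sync
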
Example #13
    def saveTrace(self, checked):
        #diag = QFileDialog.getSaveFileName(self, "Select destination", "./", "Comma Separated Values (*.csv)");
        diag = QFileDialog(self)
        diag.setAcceptMode(QFileDialog.AcceptSave)  # Save file, not open one
        diag.setNameFilter("Comma Separated Values (*.csv);;Space separated Values (*.csv)")
        diag.setDefaultSuffix("csv")  # Make sure selected files end in .csv
        diag.exec()
        try:
            filename = diag.selectedFiles()[0]
        except IndexError:
            filename = ''
        user_filter = diag.selectedNameFilter()
        if (user_filter == "Space separated Values (*.csv)"):
            delimiter = " "
        else:
            delimiter = ","

        if (filename != '' and not os.path.isdir(filename)):

            npzfile = self.last_result
            t = npzfile["t"]
            cos2 = npzfile["cos2"]
            cos2d = npzfile["cos2d"]
            extra_header = []
            extra_columns = []
            if ('Javg' in npzfile.keys()):
                Javg = npzfile["Javg"]
                std = npzfile["std"]
                psi_pulse = npzfile["psi"]
                psi_final = psi_pulse[-1]
                psi_out = numpy.abs(psi_final)**2
                percentile_999 = npzfile["percentile_999"]
                extra_header = [
                    "<J>", "std(J)", "J_99.9%", "Probability coefficients"
                ]
                extra_columns = [Javg, std, percentile_999, psi_out]

            utils.save_to_csv(filename, t, cos2, cos2d, extra_header,
                              extra_columns, delimiter)
Example #14
    def _save_records_to_csv(self, records):
        """
        This will take the records and save it to a csv file

        Parameters:
        -----------
        records: List[dict]

        Returns:
        -------
        path: str
            This will return the path of the csv file where the records have been
            saved
        """
        if records:
            try:
                path = save_to_csv(self.headers, self.csvfile, records)
                # save_to_csv has a side effect: it writes the records to a
                # file and returns the path of that file. Additional details
                # such as the headers and the date are recorded, mainly so the
                # information can be saved to a log file. Below we assert that
                # the returned path is the same csv file we intended to write.
                assert self.csvfile == path

                self.log.debug("csv file {path}".format(path=path))
                return path
            except TypeError:
                information = "cannot save to csv as start date is not set"
                self.log.debug(information)
                raise ValueError(information)
            except AttributeError as e:
                p = "the return value of the functions has to be an iterable"
                self.log.debug(p)
                raise ValueError(p)
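
`save_to_csv` is only called here, not defined; the docstring and the assert imply it takes the headers, a target path and a list of dict records, and returns the path it wrote. A sketch under those assumptions:

# Hypothetical sketch consistent with the call above; the real helper may differ.
import csv


def save_to_csv(headers, csvfile, records):
    """Write a list of dict records to csvfile and return the path written."""
    with open(csvfile, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(records)
    return csvfile
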
Example #15
    test_data = build_loader(mode='test', cfg=cfg).get_data()

    # pre process
    train = build_pre_process(data=train_data, mode='train',
                              cfg=cfg).get_feature()
    valid = build_pre_process(data=valid_data, mode='valid',
                              cfg=cfg).get_feature()
    test = build_pre_process(data=test_data, mode='test',
                             cfg=cfg).get_feature()
    features = [
        c for c in train.columns
        if c not in ['loadingOrder', 'label', 'mmin', 'mmax', 'count']
    ]

    # training
    meta = {
        'train': train,
        'valid': valid,
        'test': test,
        'pred': features,
        'label': 'label',
        'seed': 1080,
        'is_shuffle': True,
    }
    trainer = build_trainer(meta=meta, cfg=cfg)

    result = trainer.do_train()

    # save results
    save_to_csv(result, test_data, cfg)
Example #16
    delimiter = ","
    savepath = 'C:\Jmax_data/'
    for i in range(len(p)):
        INTENSITY = 0.6 * p[i] * 1e-6 / (tau * (waist**2))
        pulses = [config.laserpulse(INTENSITY, tau, 0, waist=31e-6)]
        t, cos2, cos2d, psi = dispatcher.dispatch(states,
                                                  pulses,
                                                  Jmax,
                                                  Nshells,
                                                  molecule,
                                                  dt,
                                                  t_end,
                                                  probe_waist,
                                                  calculate_cos2d,
                                                  do_psi_pulse=True)
        psi = psi[0]
        pdf = numpy.abs(psi)**2
        Js = numpy.arange(0, Jmax + 1)
        Jssq = Js**2
        Javg = numpy.sum(Js * pdf, axis=1)
        Jsq_avg = numpy.sum(Jssq * pdf, axis=1)
        std = numpy.sqrt(Jsq_avg - Javg**2)
        cdf = numpy.cumsum(numpy.abs(psi)**2, axis=1)
        percentile_999 = numpy.argmax(cdf >= 0.999, axis=1)
        psi_out = numpy.abs(psi[-1])**2
        extra_columns = [Javg, std, percentile_999, psi_out]
        filename = mol + '_p' + str(int(p[i])) + '_T' + str(
            tau * 1e12) + '.csv'
        utils.save_to_csv(savepath + filename, t, cos2, cos2d, extra_header,
                          extra_columns, delimiter)
Example #17
warnings.filterwarnings('ignore')
import re
from datetime import datetime
from utils import save_to_csv

DATA_PATH = '../data/'

FILE_NAME = 'reviews.csv'


def clean_data(reviews_df):
    for i in range(0, len(reviews_df)):
        reviews_df.date[i] = re.sub('Reviewed in India on', '',
                                    reviews_df.date[i])
        reviews_df.date[i] = reviews_df.date[i].strip()
        reviews_df.date[i] = datetime.strptime(reviews_df.date[i],
                                               '%d %B %Y').date()

    reviews_df['rating'] = reviews_df['rating'].astype('int')

    return reviews_df


if __name__ == '__main__':
    df = pd.read_csv(DATA_PATH + FILE_NAME)

    cleaned_df = clean_data(df)

    save_to_csv(cleaned_df, DATA_PATH, FILE_NAME)
Example #18
def twitter_api_caller(keyword_user_search_param, keywords_list, ids,
                       batch_size, save_dir, csv_name):

    if keyword_user_search_param == 'search':
        csv_columns = [
            'id', 'username', 'text', 'keywords', 'date', 'location'
        ]
    else:
        csv_columns = ['id', 'username', 'text', 'date', 'location']

    try:
        os.chdir(os.path.join(ROOT_DIR, "scraped_tweet"))
        os.mkdir(save_dir)

        print("Directory 'final_tweet_csv' Created")
    except FileExistsError:
        print("Directory 'final_tweet_csv' already exists")

    n_chunks = int((len(ids) - 1) // batch_size + 1)

    tweets = []
    i = 0
    while i < n_chunks:

        if i > 0 and i % 300 == 0:
            # if the batch count exceeds 300, requests could start to fail, so pause briefly
            time.sleep(60)

        if i != n_chunks - 1:
            batch = ids[i * batch_size:(i + 1) * batch_size]
        else:
            batch = ids[i * batch_size:]

        print(f"Processing batch n° {i+1}/{n_chunks} ...")
        try:
            list_of_tw_status = api.statuses_lookup(batch,
                                                    tweet_mode="extended")
        except RateLimitError as err:
            print('Tweepy: Rate Limit exceeded')
            # https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/faq
            save_to_csv(tweets, os.path.join("scraped_tweet", save_dir),
                        f"{csv_name}_last_batch_{i}", csv_columns)
            break
        except Exception as err:
            save_to_csv(tweets, os.path.join("scraped_tweet", save_dir),
                        f"{csv_name}_last_batch_{i}", csv_columns)
            print(f"General Error: {str(err)}")
            break

        tweets_batch = []
        for status in list_of_tw_status:
            try:
                tweet = {
                    "id": status.id,
                    "username": status.user.screen_name,
                    "text": status.full_text,
                    "date": str(status.created_at),
                    "location": status.user.location
                }

                if keyword_user_search_param == 'search':
                    kl1 = [
                        e for e in keywords_list
                        if e.lower() in status.full_text.lower()
                    ]
                    kl2 = [
                        e for e in keywords_list
                        if e.lower() in status.user.screen_name.lower()
                    ]
                    keywords = [x for x in set(kl1 + kl2) if len(x) > 0]
                    tweet["keywords"] = keywords

            except Exception as err:
                print(f"General Error: {str(err)}")
                continue
            tweets_batch.append(tweet)
        print(f"Processed - scraped {len(tweets_batch)} tweets.")
        if len(tweets_batch) == 0:
            save_to_csv(tweets, os.path.join("scraped_tweet", save_dir),
                        f"{csv_name}_last_batch_{i}", csv_columns)
            print("No tweets scraped")
            break

        i += 1
        tweets.append(tweets_batch)

    save_to_csv(tweets, os.path.join("scraped_tweet", save_dir), csv_name,
                csv_columns)
Example #19
    try:
        os.mkdir("data")
    except OSError:
        pass

    if (not store_csv):
        if (do_psi_pulse):
            numpy.savez(filename,
                        t=t,
                        cos2=cos2,
                        cos2d=cos2d,
                        Javg=Javg,
                        std=std,
                        percentile_999=percentile_999,
                        psi=psi_pulse)
        else:
            numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d)
    else:
        if (out_filename == ""):
            filename = filename.replace("npz", "csv")
        if (not do_psi_pulse):
            utils.save_to_csv(filename, t, cos2, cos2d)
        else:
            utils.save_to_csv(filename, t, cos2, cos2d,
                              ["<J>", "std(J)", "J_99.9%"],
                              [Javg, std, percentile_999])

    if (out_filename == ""):
        print("Saved trace in " + filename)
Example #20
        os.path.basename(args.pulses), Nshells, T, probe_waist
    ]

    filename = out_filename
    if (filename == ""):
        filename = "data/" + ','.join([str(i) for i in attributes]) + ".npz"

    meta = dict()
    meta['molecule'] = molecule
    meta['Jmax'] = Jmax
    meta['dt'] = dt
    meta['pulses'] = pulses
    meta['Nshells'] = Nshells
    meta['temperature'] = T
    meta['probe_waist'] = probe_waist

    try:
        os.mkdir("data")
    except OSError:
        pass

    if (not store_csv):
        numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d, meta=meta)
    else:
        if (out_filename == ""):
            filename = filename.replace("npz", "csv")
        utils.save_to_csv(filename, t, cos2, cos2d)

    if (out_filename == ""):
        print("Saved trace in " + filename)
Example #21
import utils

if __name__ == '__main__':

    dir = "log/peps normal"
    pattern = r'(left|right|front|back|start|trunk|lock)\\\d{1,2}.csv$'
    pattern_valid = r'(3|6|9|12).csv$'
    utils.construct_set(dir, pattern, pattern_valid, filter=1)
    utils.save_to_csv()

    X, y = utils.load_all()
    X_train, X_valid, y_train, y_valid = utils.load_train_valid()

    # compare train result
    methods = ["Logistic", "LDA", "QDA", "KNN", "SVM", "RF", "GBM", "MLP"]
    params = [
        None, None, None, {
            "n_neighbors": 10
        }, {
            "C": 0.25,
            "gamma": 0.5
        }, {
            "max_features": 2,
            "n_estimators": 100
        }, {
            "n_estimators": 400,
            "max_depth": 3
        }, {
            "hidden_layer_sizes": (16, 8)
        }
    ]
Example #22
    t_end = 30e-12

    cache = dict()
    states = [(1, 0, 0, 0, 1)]
    extra_header = ["<J>", "std(J)", "J_99.9%", "Probability coefficients"]
    delimiter = ","
    savepath = 'C:\Jmax_data/'
    for i in range(len(p)):
        INTENSITY = 0.6 * p[i] * 1e-6 / (tau * (waist**2))
        pulses = [config.laserpulse(INTENSITY, tau, 0, waist=31e-6)]
        t, cos2, cos2d, psi = dispatcher.dispatch(states, pulses, Jmax, Nshells,
                                                  molecule, dt, t_end, probe_waist,
                                                  calculate_cos2d, do_psi_pulse=True)
        psi = psi[0]
        pdf = numpy.abs(psi)**2
        Js = numpy.arange(0, Jmax + 1)
        Jssq = Js**2
        Javg = numpy.sum(Js * pdf, axis=1)
        Jsq_avg = numpy.sum(Jssq * pdf, axis=1)
        std = numpy.sqrt(Jsq_avg - Javg**2)
        cdf = numpy.cumsum(numpy.abs(psi)**2, axis=1)
        percentile_999 = numpy.argmax(cdf >= 0.999, axis=1)
        psi_out = numpy.abs(psi[-1])**2
        extra_columns = [Javg, std, percentile_999, psi_out]
        filename = mol + '_p' + str(int(p[i])) + '_T' + str(tau * 1e12) + '.csv'
        utils.save_to_csv(savepath + filename, t, cos2, cos2d, extra_header,
                          extra_columns, delimiter)
Example #23
def twitter_api_caller(keyword_user_search_param, keywords_list, ids,
                       batch_size, save_dir, csv_name, collect_replies):
    if keyword_user_search_param == 'search':
        csv_columns = [
            'id', 'username', 'text', 'keywords', 'date', 'location'
        ]
    else:
        csv_columns = ['id', 'username', 'text', 'date', 'location']

    if collect_replies:
        csv_columns.append('replies')

    try:
        os.chdir(SCRAPED_TWEET_PATH)
        os.mkdir(save_dir)

        print("Directory 'final_tweet_csv' Created")
    except FileExistsError:
        print("Directory 'final_tweet_csv' already exists")

    n_chunks = int((len(ids) - 1) // batch_size + 1)

    tweets = []
    i = 0
    while i < n_chunks:

        if i > 0 and i % 300 == 0:
            # if the batch count exceeds 300, requests could start to fail, so pause briefly
            time.sleep(60)

        if i != n_chunks - 1:
            batch = ids[i * batch_size:(i + 1) * batch_size]
        else:
            batch = ids[i * batch_size:]

        print(f"Processing batch n° {i + 1}/{n_chunks} ...")
        try:
            list_of_tw_status = api.statuses_lookup(batch,
                                                    tweet_mode="extended")
        except RateLimitError as err:
            print('Tweepy: Rate Limit exceeded')
            # https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/faq
            save_to_csv(tweets, save_dir, f"{csv_name}_last_batch_{i}",
                        csv_columns)
            break
        except Exception as err:
            save_to_csv(tweets, save_dir, f"{csv_name}_last_batch_{i}",
                        csv_columns)
            print(f"General Error: {str(err)}")
            break

        tweets_batch = []
        for status in list_of_tw_status:
            try:
                tweet = {
                    "id": status.id,
                    "username": status.user.screen_name,
                    "text": status.full_text.replace('\n', ' '),
                    "date": str(status.created_at),
                    "location": status.user.location
                }

                if keyword_user_search_param == 'search':
                    keywords_in_tweet = get_tweet_keywords(
                        keywords_list, status)

                    tweet["keywords"] = list(set(keywords_in_tweet))

                if collect_replies:
                    replies = collect_tweet_replies(status.id,
                                                    max_num_replies=100)
                    tweet['replies'] = replies

            except Exception as err:
                print(f"General Error: {str(err)}")
                continue
            tweets_batch.append(tweet)
        print(f"Processed - scraped {len(tweets_batch)} tweets.")
        if len(tweets_batch) == 0:
            save_to_csv(tweets, save_dir, f"{csv_name}_last_batch_{i}",
                        csv_columns)
            print("No tweets scraped")
            break

        i += 1
        tweets.append(tweets_batch)

    save_to_csv(tweets, save_dir, csv_name, csv_columns)
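
In both Twitter scrapers the `save_to_csv` helper receives the list of tweet batches, a directory, a file name and the column list; a minimal sketch of such a helper, assuming it simply flattens the batches and writes one CSV (the real project code is not shown):

# Sketch of the assumed helper; the project's save_to_csv is not part of this example.
import csv
import os


def save_to_csv(tweet_batches, save_dir, csv_name, csv_columns):
    """Flatten the list of tweet batches and write them to <save_dir>/<csv_name>.csv."""
    rows = [tweet for batch in tweet_batches for tweet in batch]
    path = os.path.join(save_dir, csv_name + '.csv')
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)
    return path
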
Example #24
    def train(self,
              train_data,
              test_data,
              prediction_data,
              epochs,
              restore_checkpoint=False,
              csv_name="transformer_data.csv"):
        """
            Training method that uses distributed training
            
            Parameters:
                train_data - input data for training. Should be in form : en_train, fr_train_in, fr_train_out
                test_data - input data for test step. Should be in form : en_test, fr_test_in, fr_test_out
                prediction_data - input data for prediction step. Should be in form of: en_predict, fr_predict
                epochs - number of epochs that should be run
                restore_checkpoint - should we restore last checkpoint and resume training. Default set to false.
                csv_name - name of csv file where losses/accuracies will be saved. default = transformer_data.csv.
                           If restore_checkpoint is set to False, file will be erased and only current run will be present.
                
            Returns:
                tuple losses, accuracy where losses = (train_losses, test_losses), accuracy = (train_accuracy, test_accuracy)
        """

        en_predict, fr_predict = prediction_data
        en_vocab_size = self.en_tokenizer.vocab_size
        fr_vocab_size = self.fr_tokenizer.vocab_size + 2

        print('Number of devices: {}'.format(
            self.strategy.num_replicas_in_sync))
        GLOBAL_BATCH_SIZE = self.batch_size * self.strategy.num_replicas_in_sync

        train_dataset_distr, test_dataset_distr = makeDatasets(
            train_data, test_data, GLOBAL_BATCH_SIZE, self.strategy)

        test_losses = []
        train_losses = []
        train_accuracyVec = []
        test_accuracyVec = []
        test_loss = tf.keras.metrics.Mean()
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        prediction_idx = np.random.randint(low=0, high=len(en_predict),
                                           size=1)[0]
        prediction_en, prediction_fr = en_predict[prediction_idx], fr_predict[
            prediction_idx]
        print("prediction input : ", prediction_en)
        print("prediction output: ", prediction_fr)

        with self.strategy.scope():
            custom_learning_rate = customLearningRate(warmup_steps=4000,
                                                      d_model=self.d_model)

            self.optimizer = tf.keras.optimizers.Adam(
                learning_rate=custom_learning_rate,
                beta_1=0.9,
                beta_2=0.98,
                epsilon=1e-9)

            self.transformer_model = Transformer(
                embedding_size=self.d_model,
                dff=self.dff,
                input_max_seq_length=2000,
                output_max_seq_length=1855,
                input_vocab_size=en_vocab_size,
                output_vocab_size=fr_vocab_size,
                encoder_blocks=self.num_layers,
                decoder_blocks=self.num_layers,
                heads=self.num_heads)

            ckpt = tf.train.Checkpoint(transformer=self.transformer_model,
                                       optimizer=self.optimizer,
                                       epoch=tf.Variable(1))

            manager = tf.train.CheckpointManager(ckpt,
                                                 self.checkpoint_path,
                                                 max_to_keep=5)

            if manager.latest_checkpoint and restore_checkpoint:
                ckpt.restore(manager.latest_checkpoint)
                print('Latest checkpoint restored!!')
            else:
                print("training from scratch")

            loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction="none")

            def loss_fn(real, targets):
                mask = tf.math.logical_not(tf.math.equal(targets, 0))
                mask = tf.cast(mask, tf.int64)
                per_example_loss = loss_object(targets,
                                               real,
                                               sample_weight=mask)
                return tf.nn.compute_average_loss(
                    per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

            def train_step(input_data, real_data_in, real_data_out):
                encoder_pad_mask = makePaddingMask(input_data)
                elements_mask = makeSequenceMask(real_data_in.shape[1])
                with tf.GradientTape() as tape:
                    predicted_data = self.transformer_model(
                        input_data,
                        real_data_in,
                        encoder_pad_mask,
                        elements_mask,
                        training_enabled=True,
                        training=True)
                    loss = loss_fn(predicted_data, real_data_out)

                trainable_vars = self.transformer_model.trainable_variables
                grads = tape.gradient(loss, trainable_vars)
                self.optimizer.apply_gradients(zip(grads, trainable_vars))
                train_accuracy.update_state(real_data_out, predicted_data)
                return loss

            @tf.function
            def distributed_train_step(input_data, real_data_in,
                                       real_data_out):
                per_replica_losses = self.strategy.experimental_run_v2(
                    train_step, args=(input_data, real_data_in, real_data_out))
                return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)

            def test_step(input_data, real_data_in, real_data_out):
                encoder_pad_mask = makePaddingMask(input_data)
                elements_mask = makeSequenceMask(real_data_in.shape[1])
                predicted_data = self.transformer_model(input_data,
                                                        real_data_in,
                                                        encoder_pad_mask,
                                                        elements_mask,
                                                        training_enabled=False,
                                                        training=False)
                loss = loss_fn(predicted_data, real_data_out)

                test_accuracy.update_state(real_data_out, predicted_data)
                return loss

            @tf.function
            def distributed_test_step(input_data, real_data_in, real_data_out):
                per_replica_losses = self.strategy.experimental_run_v2(
                    test_step,
                    args=(
                        input_data,
                        real_data_in,
                        real_data_out,
                    ))
                return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)

            for epoch in range(epochs):
                total_loss = 0
                num_batches = 0
                test_loss.reset_states()
                test_accuracy.reset_states()
                train_accuracy.reset_states()

                for _, (en_data, fr_data_in,
                        fr_train_out) in enumerate(train_dataset_distr):
                    loss = distributed_train_step(en_data, fr_data_in,
                                                  fr_train_out)
                    total_loss += loss
                    num_batches += 1
                train_losses.append(total_loss / num_batches)

                total_loss = 0
                num_batches = 0
                for _, (en_data, fr_data_in,
                        fr_data_out) in enumerate(test_dataset_distr):
                    loss = distributed_test_step(en_data, fr_data_in,
                                                 fr_data_out)
                    total_loss += loss
                    num_batches += 1
                test_losses.append(total_loss / num_batches)

                print ('Epoch {} training Loss {:.4f} Accuracy {:.4f}  test Loss {:.4f} Accuracy {:.4f}' .format( \
                                                            epoch + 1,
                                                            train_losses[-1],
                                                            train_accuracy.result(),
                                                            test_losses[-1],
                                                            test_accuracy.result()))
                train_accuracyVec.append(train_accuracy.result())
                test_accuracyVec.append(test_accuracy.result())

                if epoch % self.predict_every == 0 and epoch != 0:
                    output_seq = self.translate(prediction_en)
                    print(
                        "----------------------------PREDICTION----------------------------"
                    )
                    print("Predicted :", output_seq)
                    print("Correct   :", prediction_fr)
                    print(
                        "--------------------------END PREDICTION--------------------------"
                    )

                ckpt.epoch.assign_add(1)
                if int(epoch) % 5 == 0:
                    save_path = manager.save()
                    print("Saving checkpoint for epoch {}: {}".format(
                        epoch, save_path))

            save_path = manager.save()
            print('Saving checkpoint for end at {}'.format(save_path))
            save_to_csv(losses=(train_losses, test_losses),
                        accuracy=(train_accuracyVec, test_accuracyVec),
                        append=restore_checkpoint,
                        file_name=csv_name)

            return (train_losses, test_losses), (train_accuracyVec,
                                                 test_accuracyVec)
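
The `save_to_csv` call at the end of this training loop (and the similar ones in the following examples) passes the loss and accuracy histories plus an append flag; a sketch of a metrics writer with that signature, assuming one row per epoch:

# Illustrative sketch of the metrics writer implied by the call above; not the original code.
import csv


def save_to_csv(losses, accuracy, append=False, file_name="training_data.csv"):
    """Write one row per epoch: train loss, test loss, train accuracy, test accuracy."""
    (train_losses, test_losses), (train_acc, test_acc) = losses, accuracy
    mode = "a" if append else "w"
    with open(file_name, mode, newline="") as f:
        writer = csv.writer(f)
        if not append:
            writer.writerow(["train_loss", "test_loss", "train_accuracy", "test_accuracy"])
        for row in zip(train_losses, test_losses, train_acc, test_acc):
            writer.writerow([float(x) for x in row])
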
Example #25
    def train(self,
              train_data,
              test_data,
              prediction_data,
              epochs,
              restore_checkpoint=False,
              csv_name="seq2seq_data.csv"):
        """
            Training method that uses distributed training
            
            Parameters:
                train_data - input data for training. Should be in form : en_train, fr_train_in, fr_train_out
                test_data - input data for test step. Should be in form : en_test, fr_test_in, fr_test_out
                prediction_data - input data for prediction step. Should be in form of: en_predict, fr_predict
                epochs - number of epochs that should be run
                restore_checkpoint - should we restore last checkpoint and resume training. Default set to false.
                csv_name - name of csv file where losses/accuracies will be saved. default = seq2seq_data.csv.
                           If restore_checkpoint is set to False, file will be erased and only current run will be present.
                
            Returns:
                tuple losses, accuracy where losses = (train_losses, test_losses), accuracy = (train_accuracy, test_accuracy)
        """

        en_predict, fr_predict = prediction_data
        en_vocab_size = self.en_tokenizer.vocab_size
        fr_vocab_size = self.fr_tokenizer.vocab_size + 2

        print('Number of devices: {}'.format(
            self.strategy.num_replicas_in_sync))
        GLOBAL_BATCH_SIZE = self.batch_size * self.strategy.num_replicas_in_sync

        train_dataset_distr, test_dataset_distr = makeDatasets(
            train_data, test_data, GLOBAL_BATCH_SIZE, self.strategy)

        test_losses = []
        train_losses = []
        train_accuracyVec = []
        test_accuracyVec = []
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        prediction_idx = np.random.randint(low=0, high=len(en_predict),
                                           size=1)[0]
        prediction_en, prediction_fr = en_predict[prediction_idx], fr_predict[
            prediction_idx]
        print("input : ", prediction_en)
        print("output: ", prediction_fr)

        with self.strategy.scope():
            self.optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)
            self.encoder = Encoder(self.lstm_size, self.embedding_size,
                                   en_vocab_size)
            self.decoder = Decoder(self.lstm_size, self.embedding_size,
                                   fr_vocab_size)

            ckpt = tf.train.Checkpoint(encoder=self.encoder,
                                       decoder=self.decoder,
                                       optimizer=self.optimizer,
                                       epoch=tf.Variable(1))

            manager = tf.train.CheckpointManager(ckpt,
                                                 "./checkpoints/Seq2Seq",
                                                 max_to_keep=5)

            if manager.latest_checkpoint and restore_checkpoint:
                ckpt.restore(manager.latest_checkpoint)
                print('Latest checkpoint restored!!')
            else:
                print("training from scratch")

            loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction="none")

            def compute_loss(predictions, labels):
                mask = tf.math.logical_not(tf.math.equal(labels, 0))
                mask = tf.cast(mask, tf.int64)
                per_example_loss = loss_obj(labels,
                                            predictions,
                                            sample_weight=mask)
                return tf.nn.compute_average_loss(
                    per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

            # one training step
            def train_step(encoder_input, decoder_in, decoder_out,
                           initial_states):
                with tf.GradientTape() as tape:
                    encoder_states = self.encoder(encoder_input,
                                                  initial_state,
                                                  training_mode=True)
                    predicted_data, _, _ = self.decoder(decoder_in,
                                                        encoder_states[1:],
                                                        training_mode=True)
                    loss = compute_loss(predicted_data, decoder_out)

                trainable = self.encoder.trainable_variables + self.decoder.trainable_variables
                grads = tape.gradient(loss, trainable)
                self.optimizer.apply_gradients(zip(grads, trainable))
                train_accuracy.update_state(decoder_out, predicted_data)
                return loss

            @tf.function
            def distributed_train_step(encoder_input, decoder_in, decoder_out,
                                       initial_states):
                per_replica_losses = self.strategy.experimental_run_v2(
                    train_step,
                    args=(
                        encoder_input,
                        decoder_in,
                        decoder_out,
                        initial_states,
                    ))
                return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)

            def test_step(encoder_input, decoder_in, decoder_out):
                initial_state = self.encoder.init_states(self.batch_size)
                encoder_states = self.encoder(encoder_input,
                                              initial_state,
                                              training_mode=False)
                predicted_data, _, _ = self.decoder(decoder_in,
                                                    encoder_states[1:],
                                                    training_mode=False)
                loss = compute_loss(predicted_data, decoder_out)

                test_accuracy.update_state(decoder_out, predicted_data)
                return loss

            @tf.function
            def distributed_test_step(encoder_input, decoder_in, decoder_out):
                per_replica_losses = self.strategy.experimental_run_v2(
                    test_step, args=(
                        encoder_input,
                        decoder_in,
                        decoder_out,
                    ))
                return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)

            print(
                "starting training with {} epochs with prediction each {} epoch"
                .format(epochs, self.predict_every))
            for epoch in range(epochs):
                test_accuracy.reset_states()
                train_accuracy.reset_states()
                initial_state = self.encoder.init_states(self.batch_size)
                total_loss = 0.0
                num_batches = 0
                for _, (en_data, fr_data_in,
                        fr_data_out) in enumerate(train_dataset_distr):
                    loss = distributed_train_step(en_data, fr_data_in,
                                                  fr_data_out, initial_state)
                    total_loss += loss
                    num_batches += 1
                train_losses.append(total_loss / num_batches)
                total_loss = 0.0
                num_batches = 0
                for _, (en_data, fr_data_in,
                        fr_data_out) in enumerate(test_dataset_distr):
                    loss = distributed_test_step(en_data, fr_data_in,
                                                 fr_data_out)
                    total_loss += loss
                    num_batches += 1
                test_losses.append(total_loss / num_batches)
                print(
                    'Epoch {} training Loss {:.4f} Accuracy {:.4f}  test Loss {:.4f} Accuracy {:.4f}'
                    .format(epoch + 1, train_losses[-1],
                            train_accuracy.result(), test_losses[-1],
                            test_accuracy.result()))
                train_accuracyVec.append(train_accuracy.result())
                test_accuracyVec.append(test_accuracy.result())
                ckpt.epoch.assign_add(1)
                if int(epoch) % 5 == 0:
                    save_path = manager.save()
                    print("Saving checkpoint for epoch {}: {}".format(
                        epoch, save_path))

                if epoch % self.predict_every == 0 and epoch != 0:
                    output_seq = self.translate(prediction_en)
                    print(
                        "----------------------------PREDICTION----------------------------"
                    )
                    print("Predicted :", output_seq)
                    print("Correct   :", prediction_fr)
                    print(
                        "--------------------------END PREDICTION--------------------------"
                    )

        save_path = manager.save()
        print('Saving checkpoint for end at {}'.format(save_path))
        save_to_csv(losses=(train_losses, test_losses),
                    accuracy=(train_accuracyVec, test_accuracyVec),
                    append=restore_checkpoint,
                    file_name=csv_name)

        return (train_losses, test_losses), (train_accuracyVec,
                                             test_accuracyVec)
Example #26
def main():
    tweets = _read_tweets_to_dataframe("data/tweet_data/", True, 2000)
    make_new_dir("data/datasets")
    save_to_csv(tweets, "data/datasets/individual_tweets.csv", "tweet_id")
Example #27
    def train(self,
              train_data,
              test_data,
              prediction_data,
              epochs,
              attention_type="general",
              restore_checkpoint=False,
              csv_name="seq2seqAttention_data.csv"):
        """
            Training method that uses distributed training
            
            Parameters:
                train_data - input data for training. Should be in form : en_train, fr_train_in, fr_train_out
                test_data - input data for test step. Should be in form : en_test, fr_test_in, fr_test_out
                prediction_data - input data for prediction step. Should be in form of: en_predict, fr_predict
                epochs - number of epochs that should be run
                attention_type - which attention method to use: dot/general/concat. Default - general
                restore_checkpoint - should we restore last checkpoint and resume training. Default set to false.
                csv_name - name of csv file where losses/accuracies will be saved. default = seq2seqAttention_data.csv.
                  If restore_checkpoint is set to False, file will be erased and only current run will be present.
            
            Returns:
                tuple losses, accuracy where losses = (train_losses, test_losses), accuracy = (train_accuracy, test_accuracy)
        """

        print_heatmap = True

        en_predict, fr_predict = prediction_data
        en_vocab_size = self.en_tokenizer.vocab_size
        fr_vocab_size = self.fr_tokenizer.vocab_size + 2

        print('Number of devices: {}'.format(
            self.strategy.num_replicas_in_sync))
        GLOBAL_BATCH_SIZE = self.batch_size * self.strategy.num_replicas_in_sync

        train_dataset_distr, test_dataset_distr = makeDatasets(
            train_data, test_data, GLOBAL_BATCH_SIZE, self.strategy)

        test_losses = []
        train_losses = []
        train_accuracyVec = []
        test_accuracyVec = []
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        prediction_idx = np.random.randint(low=0, high=len(en_predict),
                                           size=1)[0]
        prediction_en, prediction_fr = en_predict[prediction_idx], fr_predict[
            prediction_idx]
        print("prediction input : ", prediction_en)
        print("prediction output: ", prediction_fr)

        if not os.path.exists("heatmap"):
            os.mkdir("heatmap")

        alignments = []

        with self.strategy.scope():
            self.encoder = Encoder(lstm_size=self.lstm_size,
                                   embedding_size=self.embedding_size,
                                   vocab_size=en_vocab_size)

            self.decoder = Decoder(lstm_size=self.lstm_size,
                                   embedding_size=self.embedding_size,
                                   vocab_size=fr_vocab_size,
                                   attention_type=attention_type)

            self.optimizer = tf.keras.optimizers.Adam(clipnorm=0.5)

            ckpt = tf.train.Checkpoint(encoder=self.encoder,
                                       decoder=self.decoder,
                                       optimizer=self.optimizer,
                                       epoch=tf.Variable(1))

            manager = tf.train.CheckpointManager(
                ckpt, "./checkpoints/Seq2SeqAttention", max_to_keep=5)

            if manager.latest_checkpoint and restore_checkpoint:
                ckpt.restore(manager.latest_checkpoint)
                print('Latest checkpoint restored!!')
            else:
                print("training from scratch")

            loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction="none")

            def compute_loss(predictions, labels):
                mask = tf.math.logical_not(tf.math.equal(labels, 0))
                mask = tf.cast(mask, tf.int64)
                per_example_loss = loss_obj(labels,
                                            predictions,
                                            sample_weight=mask)
                return tf.nn.compute_average_loss(
                    per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

            # one training step
            def train_step(en_data, fr_data_in, fr_data_out, initial_states):
                loss = 0
                predicted_output = None
                train_accuracy.reset_states()
                with tf.GradientTape() as tape:
                    encoder_output, state_h, state_c = self.encoder(
                        en_data, initial_states, training_mode=True)
                    # shape[1] because we want each word for all batches
                    for i in range(fr_data_out.shape[1]):
                        decoder_input = tf.expand_dims(fr_data_in[:, i], 1)
                        decoder_output, state_h, state_c, _ = self.decoder(
                            decoder_input, (state_h, state_c),
                            encoder_output,
                            training_mode=True)

                        loss += compute_loss(decoder_output, fr_data_out[:, i])
                        decoder_output = tf.expand_dims(decoder_output, axis=1)
                        if i == 0:
                            predicted_output = decoder_output
                        else:
                            predicted_output = tf.concat(
                                [predicted_output, decoder_output], axis=1)

                trainable_vars = self.encoder.trainable_variables + self.decoder.trainable_variables
                grads = tape.gradient(loss, trainable_vars)
                self.optimizer.apply_gradients(zip(grads, trainable_vars))

                train_accuracy.update_state(fr_data_out, predicted_output)

                return loss / fr_data_out.shape[1]

            @tf.function
            def distributed_train_step(en_data, fr_data_in, fr_data_out,
                                       initial_states):
                per_replica_losses = self.strategy.experimental_run_v2(
                    train_step,
                    args=(
                        en_data,
                        fr_data_in,
                        fr_data_out,
                        initial_states,
                    ))
                return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)

            def test_step(en_data, fr_data_in, fr_data_out):
                loss = 0
                predicted_output = []
                initial_states = self.encoder.init_states(self.batch_size)
                encoder_output, state_h, state_c = self.encoder(
                    en_data, initial_states, training_mode=False)
                for i in range(fr_data_out.shape[1]):
                    decoder_input = tf.expand_dims(fr_data_in[:, i], 1)
                    decoder_output, state_h, state_c, _ = self.decoder(
                        decoder_input, (state_h, state_c),
                        encoder_output,
                        training_mode=False)
                    loss += compute_loss(decoder_output, fr_data_out[:, i])

                    decoder_output = tf.expand_dims(decoder_output, axis=1)
                    if i == 0:
                        predicted_output = decoder_output
                    else:
                        predicted_output = tf.concat(
                            [predicted_output, decoder_output], axis=1)

                test_accuracy.update_state(fr_data_out, predicted_output)

                return loss / fr_data_out.shape[1]

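            # Same replica dispatch and SUM-reduction as the training step,
            # but for the evaluation pass.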
            @tf.function
            def distributed_test_step(en_data, fr_data_in, fr_data_out):
                per_replica_losses = self.strategy.experimental_run_v2(
                    test_step, args=(
                        en_data,
                        fr_data_in,
                        fr_data_out,
                    ))
                return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)

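            # Main loop: per epoch, reset the metrics, run the distributed
            # train and test passes, record losses/accuracies, advance the
            # checkpoint counter, and save a checkpoint every 5 epochs.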
            print(
                'Starting training for {} epochs; running a sample prediction every {} epochs'
                .format(epochs, self.predict_every))
            for epoch in range(epochs):
                test_accuracy.reset_states()
                train_accuracy.reset_states()
                initial_states = self.encoder.init_states(self.batch_size)

                total_loss = 0.0
                num_batches = 0
                for _, (en_data, fr_data_in,
                        fr_data_out) in enumerate(train_dataset_distr):
                    loss = distributed_train_step(en_data, fr_data_in,
                                                  fr_data_out, initial_states)
                    total_loss += loss
                    num_batches += 1
                train_losses.append(total_loss / num_batches)
                total_loss = 0.0
                num_batches = 0
                for _, (en_data, fr_data_in,
                        fr_data_out) in enumerate(test_dataset_distr):
                    loss = distributed_test_step(en_data, fr_data_in,
                                                 fr_data_out)
                    total_loss += loss
                    num_batches += 1

                test_losses.append(total_loss / num_batches)
                print('Epoch {} training Loss {:.4f} Accuracy {:.4f}  test Loss {:.4f} Accuracy {:.4f}'.format(
                    epoch + 1,
                    train_losses[-1],
                    train_accuracy.result(),
                    test_losses[-1],
                    test_accuracy.result()))

                train_accuracyVec.append(train_accuracy.result())
                test_accuracyVec.append(test_accuracy.result())
                ckpt.epoch.assign_add(1)

                if epoch % 5 == 0:
                    save_path = manager.save()
                    print("Saving checkpoint for epoch {}: {}".format(
                        epoch, save_path))

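                # Translate the sample sentence (prediction_en) every epoch for
                # qualitative monitoring; the result is only printed every
                # self.predict_every epochs.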
                predicted, alignment = self.translate(prediction_en)

                if epoch % self.predict_every == 0:
                    print(
                        "----------------------------PREDICTION----------------------------"
                    )
                    print("Predicted:  {} ".format(predicted))
                    print("Should be:  {} ".format(prediction_fr))
                    print(
                        "--------------------------END PREDICTION--------------------------"
                    )

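                # If enabled, collect the attention weights for that translation
                # and save them as a heatmap image under heatmap/ (the directory
                # must already exist for plt.savefig to succeed).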
                if print_heatmap:
                    attention_map = np.squeeze(alignment, (1, 2))
                    alignments.append(attention_map)
                    fig = plt.figure(figsize=(10, 10))
                    ax = fig.add_subplot(1, 1, 1)
                    ax.matshow(attention_map, cmap='jet')
                    ax.set_xticklabels([''] + prediction_en.split(' '),
                                       rotation=90)
                    ax.set_yticklabels([''] + predicted.split(' '))

                    plt.savefig('heatmap/prediction_{}.png'.format(epoch))
                    #plt.show()
                    plt.close()

        save_path = manager.save()
        print('Saving final checkpoint at {}'.format(save_path))
        save_to_csv(losses=(train_losses, test_losses),
                    accuracy=(train_accuracyVec, test_accuracyVec),
                    append=restore_checkpoint,
                    file_name=csv_name)

        return (train_losses, test_losses), (train_accuracyVec,
                                             test_accuracyVec)
示例#28
0
import os
import sys

import utils

if __name__ == '__main__':
    print('main')
    if len(sys.argv) < 2:
        raise Exception('Enter the path of the directory')

    path = sys.argv[1]

    if not os.path.isdir(path):
        raise Exception(f'Invalid path: {path}')
    print('path', path)
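    # Process every file in the directory: extract tags and raw text, map them
    # to word-level rows, and write one CSV named after each input file.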
    for idx, filename in enumerate(os.listdir(path)):
        # print(filename)
        tags, text = utils.extract_tags_risk(os.path.join(path, filename))

        words = utils.extract_words(text, tags, idx)

        df = utils.map_output(words)
        name = filename.split('.')[0]
        utils.save_to_csv('.', f'{name}.csv', df)
示例#29
0
    attributes = [molecule.name, Jmax, J, K, M, dt,
                  os.path.basename(args.pulses), Nshells, probe_waist]

    # If no output name was given, build one that encodes the run parameters.
    filename = out_filename
    if filename == "":
        filename = ','.join([str(i) for i in attributes]) + ".npz"
        filename = "data/single_state_" + filename

    # Make sure the output directory exists.
    try:
        os.mkdir("data")
    except OSError:
        pass

    if not store_csv:
        if do_psi_pulse:
            numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d, Javg=Javg,
                        std=std, percentile_999=percentile_999, psi=psi_pulse)
        else:
            numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d)
    else:
        if out_filename == "":
            filename = filename.replace("npz", "csv")
        if not do_psi_pulse:
            utils.save_to_csv(filename, t, cos2, cos2d)
        else:
            utils.save_to_csv(filename, t, cos2, cos2d,
                              ["<J>", "std(J)", "J_99.9%"],
                              [Javg, std, percentile_999])

    if out_filename == "":
        print("Saved trace in " + filename)



示例#30
0
def create_dataframe(pasin, name, dates, stars, reviews):
    reviews_dict = {
        'asin': pasin,
        'name': name,
        'date': dates,
        'rating': stars,
        'review': reviews
    }

    reviews_df = pd.DataFrame(
        data=reviews_dict,
        columns=['asin', 'name', 'date', 'rating', 'review'])

    return reviews_df


if __name__ == '__main__':

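    # End-to-end scrape: company list -> product ASINs -> product links ->
    # review details, then bundle everything into a DataFrame and save it to CSV.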
    company_list = get_company_list()

    asin = get_product_asin(headers, company_list)

    link = get_product_links(headers, company_list, asin)

    pasin, name, dates, stars, reviews = get_product_details(
        headers, company_list, asin, link)

    reviews_df = create_dataframe(pasin, name, dates, stars, reviews)

    save_to_csv(reviews_df, DATA_PATH, FILE_NAME)