Example #1
def plot(experiment, output_dir="evaluation/single_scenario", input_dir="results/"):
    # setup directories for this plot
    input_dir = os.path.join(experiment, input_dir)
    output_dir = os.path.join(experiment, output_dir)
    ensure_dir(output_dir, rm=True)
    print input_dir
    print output_dir

    # load data
    ed = data.ExperimentData(path=input_dir)
    ed.normalize_times()

    # go over all scenarios and call plot methods
    for s in ed.scenarios.itervalues():
        single_scenario_plot(
            s, output_dir, yfield=["pps_local", "pps_global"],
            yname="packets per second", xlim=120, label_rename_func=label_rename_generic_performance)
        single_scenario_plot(
            s, output_dir, yfield=["pcount_local", "pcount_global"],
            yname="# pattern matches", xlim=120, label_rename_func=label_rename_matchexample)
        single_scenario_plot(
            s, output_dir, yfield=["matchcount_local", "matchcount_global"],
            yname="# pattern matches", xlim=120, label_rename_func=label_rename_matchexample)
        single_scenario_plot(
            s, output_dir, yfield=["t_request_local", "t_request_global"],
            yname="state request delay [s]", xlim=120)
Example #2
 def retranslateUi(self, MainWindow):
     _translate = QtCore.QCoreApplication.translate
     MainWindow.setWindowTitle(_translate("MainWindow", "Add Teacher"))
     self.label.setText(_translate("MainWindow", "Title:"))
     self.label_2.setText(_translate("MainWindow", "First Name:"))
     self.label_3.setText(_translate("MainWindow", "Last Name:"))
     self.label_4.setText(_translate("MainWindow", "Designation:"))
     self.label_5.setText(_translate("MainWindow", "Gender:"))
     self.titleDrop.setItemText(0, _translate("MainWindow", "Dr."))
     self.titleDrop.setItemText(1, _translate("MainWindow", "Mr."))
     self.titleDrop.setItemText(2, _translate("MainWindow", "Mrs."))
     self.titleDrop.setItemText(3, _translate("MainWindow", "Ms."))
     self.firstName.setPlaceholderText(_translate("MainWindow", "First Name"))
     self.lastName.setPlaceholderText(_translate("MainWindow", "Last Name"))
     self.designationDrop.setItemText(0, _translate("MainWindow", "Asst. Professor"))
     self.designationDrop.setItemText(1, _translate("MainWindow", "Professor"))
     self.designationDrop.setItemText(2, _translate("MainWindow", "Acct. Professor"))
     self.mGender.setText(_translate("MainWindow", "M"))
     self.fGender.setText(_translate("MainWindow", "F"))
     self.resetBtn.setText(_translate("MainWindow", "Reset"))
     self.cancelBtn.setText(_translate("MainWindow", "Cancel"))
     self.addBtn.setText(_translate("MainWindow", "Add"))
     self.label_6.setText(_translate("MainWindow", "Teacher Id:"))
     helper.ensure_dir('Training/')
     s = os.listdir('Training')
     if len(s)<9:
         id = 'RJITCSEIT0'+str(len(s)+1)
     else:
         id = 'RJITCSEIT'+str(len(s)+1)
     self.label_7.setText(_translate("MainWindow", id ))
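
The two branches above only zero-pad the counter to two digits; the same ID can be built with str.zfill (an equivalent simplification of the code above, assuming counters stay below 100 as in the original):

count = len(s) + 1                      # s = os.listdir('Training'), as above
id = 'RJITCSEIT' + str(count).zfill(2)  # e.g. 5 -> 'RJITCSEIT05', 12 -> 'RJITCSEIT12'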
Example #3
def create_dataset(teacher_id):
    faceDetect = cv2.CascadeClassifier(
        'Cascade/haarcascade_frontalface_default.xml')
    eye_cascade = cv2.CascadeClassifier('Cascade/haarcascade_eye.xml')
    cam = cv2.VideoCapture(1)
    sample_count = 0
    helper.ensure_dir('Training/')
    directory = 'Training/' + teacher_id + '/'
    helper.ensure_dir(directory)
    s = len(os.listdir(directory))
    while True:
        ret, img = cam.read()
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # VideoCapture frames are BGR, not RGB
        faces = faceDetect.detectMultiScale(gray, 1.3, 5)

        for x, y, w, h in faces:
            gray_face = cv2.resize((gray[y:y + h, x:x + w]), (110, 110))
            eyes = eye_cascade.detectMultiScale(gray_face)
            for ex, ey, ew, eh in eyes:
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                sample_count += 1
                cv2.imwrite(
                    directory + teacher_id + '_' + str(sample_count + s) +
                    '.jpg', gray[y:y + h, x:x + w])
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                cv2.waitKey(100)

        cv2.imshow('My Face', img)
        cv2.waitKey(1)
        if sample_count >= 20:
            break
    cam.release()
    cv2.destroyAllWindows()
Example #4
def train():
    recognizer = cv2.createLBPHFaceRecognizer(2, 2, 7, 7, 15)
    path = 'Training'
    userIDs, faces = helper.get_faces_with_username(path)
    recognizer.train(faces, userIDs)
    directory = 'Recognizer'
    helper.ensure_dir(directory)
    recognizer.save('Recognizer/trainingData.yaml')
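
Once trainingData.yaml is written, recognition would load the same LBPH model and predict a label for a grayscale face crop. A minimal usage sketch against the legacy cv2.createLBPHFaceRecognizer API used above (the input image path is hypothetical):

import cv2

recognizer = cv2.createLBPHFaceRecognizer()
recognizer.load('Recognizer/trainingData.yaml')  # model written by train()
gray_face = cv2.imread('some_face.jpg', 0)       # hypothetical face crop, loaded as grayscale
label, confidence = recognizer.predict(cv2.resize(gray_face, (110, 110)))
print(label, confidence)                         # lower confidence values mean closer matches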
Example #5
def get_lyrics_by_tracks(artist_tracks_id_object):
    artist_tracks_object = {}
    counter = 1
    musixmatch_regex = re.compile(r'\*.*\*\s*$') # this will delete **** This Lyrics is NOT... *** at the end of the string
    is_limit_reached = False

    helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH_JSON)

    if VERBOSE:
        helper.log_highlight('Fetching lyrics of tracks')

    for artist_id, tracks in artist_tracks_id_object.items():
        artist_tracks = {}
        artist_tracks[artist_id] = []

        if VERBOSE:
            print 'Fetching tracks of artist ' + str(artist_id) + ' [' + str(counter) + ' of ' + str(len(artist_tracks_id_object)) + ']'

        if os.path.exists(OUTPUT_DIR_MUSIXMATCH_JSON + artist_id + '.json') and SKIP_EXISTING_LYRICS:
            if VERBOSE:
                print "    Tracks of artist already fetched: " + OUTPUT_DIR_MUSIXMATCH_JSON + str(artist_id) + '.json'
                counter += 1
            continue

        for index, track_id in enumerate(tracks, start = 1):
            response    = fetch_lyrics_by_track_id(track_id)
            header      = response['message']['header']
            status_code = header['status_code']

            if VERBOSE:
                print '    Fetching lyrics of track ' + str(track_id) + ' [' + str(index) + ' of ' + str(len(tracks)) + ']'

            if status_code == 200:  # use ==, not 'is', for integer comparison
                lyrics = response['message']['body']['lyrics']['lyrics_body']

                lyrics_replaced = re.sub(r'\*.*\*\s*$', '', lyrics)

                artist_tracks[artist_id].append(lyrics_replaced)

                try:
                    artist_tracks_object[artist_id] += lyrics_replaced
                except:
                    artist_tracks_object[artist_id] = ''
                    artist_tracks_object[artist_id] += lyrics_replaced

            if status_code == 402:
                is_limit_reached = True

        counter += 1

        if not is_limit_reached:
            if VERBOSE:
                print '\n    Save JSON with lyrics\n'
            save_json(artist_tracks, OUTPUT_DIR_MUSIXMATCH_JSON + artist_id + '.json')

    return artist_tracks_object
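
The musixmatch_regex compiled above (the same pattern is passed inline to re.sub) strips the disclaimer that Musixmatch appends to every lyrics body. A small illustration with a made-up lyrics string:

import re

lyrics = 'La la la\n******* This Lyrics is NOT for Commercial use *******'
print(re.sub(r'\*.*\*\s*$', '', lyrics))  # prints the lyrics with the trailing disclaimer removed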
Example #6
 def add(self):
     id = self.label_7.text()
     fname = self.firstName.text()
     lname = self.lastName.text()
     title = self.titleDrop.currentText()
     design = self.designationDrop.currentText()
     gender = None
     if self.fGender.isChecked():
         gender = 'F'
     elif self.mGender.isChecked():
         gender = 'M'
     db.add_teacher(id,title,fname,lname,gender,design)
     helper.ensure_dir('Training/'+id+'/')
Example #7
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'
    wv_file = args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(
        train_file)  # load sentence token with entity being padding?
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example #8
File: main.py Project: strombom/ppo2
def build_env(args, env_name=EXP_NAME):
    env = gym.make("CartPole-v0")
    env = Monitor(env,
                  helper.ensure_dir(path.join(args.monitor_path, env_name)),
                  allow_early_resets=True)
    env = Reset(env)
    return env
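
Unlike Example #1, ensure_dir is used here as a pass-through: the directory is created and the path is returned so it can be handed straight to Monitor (later examples feed the same pattern into to_csv, open and torch.save with file paths). A minimal sketch of such a variant, assuming file paths should get their parent directory created instead (not the project's actual implementation):

import os

def ensure_dir(path):
    # hypothetical pass-through helper: create the parent directory of a file path,
    # or the directory itself when the path has no file extension, then return the path
    directory = os.path.dirname(path) if os.path.splitext(path)[1] else path
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    return path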
Example #9
def write_files(fdata, data_dir):
    """
    Expects a list of tuples, containing (case_id, case_data)
    Writes the {case_data} into {case_id}.txt
    All the files will be written into {data_dir} directory
    """

    helper.ensure_dir(data_dir)

    for case_id, case_data in fdata:

        if (case_id is None) or (case_data is None):
            continue

        f = open(os.path.join(data_dir, case_id + '.txt'), 'w')
        f.write(case_data.encode('utf8'))
        f.close()

    pass
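
A minimal usage sketch for write_files with hypothetical case data (entries containing None are skipped):

cases = [('case_001', u'First case text'), ('case_002', None)]
write_files(cases, 'data/cases')  # writes data/cases/case_001.txt only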
Example #10
 def add(self, MainWindow):
     id = self.label_7.text()
     fname = self.firstName.text()
     lname = self.lastName.text()
     title = self.titleDrop.currentText()
     design = self.designationDrop.currentText()
     gender = None
     if self.fGender.isChecked():
         gender = 'F'
     elif self.mGender.isChecked():
         gender = 'M'
     result, comment = db.add_teacher(id, title, fname, lname, gender,
                                      design)
     if result:
         helper.ensure_dir('Training/' + id + '/')
         MainWindow.close()
         buttonReply = QtWidgets.QMessageBox.question(
             MainWindow, 'Teacher Added', comment, QtWidgets.QMessageBox.Ok,
             QtWidgets.QMessageBox.Ok)
Example #11
def write_files(fdata, data_dir):
    """
    Expects a list of tuples, containing (case_id, case_data)
    Writes the {case_data} into {case_id}.txt
    All the files will be written into {data_dir} directory
    """

    helper.ensure_dir(data_dir)

    for case_id, case_data in fdata:

        if (case_id is None) or (case_data is None):
            continue

        f = open(os.path.join(data_dir, case_id + '.txt'), 'w')
        f.write(case_data.encode('utf8'))
        f.close()

    pass
Example #12
def get_html_by_tracks(artist_tracks_id_object):
    artist_tracks_object = {}
    counter = 1
    musixmatch_regex = re.compile(r'\*.*\*\s*$') # this will delete **** This Lyrics is NOT... *** at the end of the string

    if VERBOSE:
        helper.log_highlight('Fetching lyrics HTML of tracks')

    helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH_HTML)

    for artist_id, tracks in artist_tracks_id_object.items():
        if VERBOSE:
            print 'Fetching tracks of artist ' + str(artist_id) + ' [' + str(counter) + ' of ' + str(len(artist_tracks_id_object)) + ']'


        for index, track_id in enumerate(tracks, start = 1):
            response    = fetch_html_lyrics_by_track_id(track_id)
            header      = response['message']['header']
            status_code = header['status_code']
            has_lyrics  = response['message']['body']['track']['has_lyrics']

            if VERBOSE:
                print '    Fetching lyrics of track ' + str(track_id) + ' [' + str(index) + ' of ' + str(len(tracks)) + ']'

            if status_code == 200 and int(has_lyrics) > 0:
                track_url = response['message']['body']['track']['track_share_url']
                filename  = OUTPUT_DIR_MUSIXMATCH_HTML + str(artist_id) + '_' + str(track_id) + '.html'

                try:
                    if VERBOSE:
                        print '    Storing and retrieving data from ' + track_url

                    content = urllib.urlopen(track_url).read()

                    with open(filename, 'w') as f:
                        f.write(content)

                except IOError:  # skip this track if an IO / socket error occurred
                    if VERBOSE:
                        print '    Cannot retrieve data from ' + track_url
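
urllib.urlopen is the Python 2 API; under Python 3 the same page fetch could be written with the requests package, for example (a sketch, not part of the original script):

import requests

content = requests.get(track_url).text
with open(filename, 'w') as f:
    f.write(content)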
Example #13
def generate_data(f, reviews):

    count = 1

    dir_name = helper.get_name_without_extension(os.path.basename(f))
    helper.ensure_dir(dir_name)

    for review in reviews:

        # Without labels
        target_path = os.path.join(dir_name, 'no_label_' + str(count) + '.txt')
        helper.save_list_to_file(target_path, [x[2] for x in review.samples])

        # With labels
        target_path = os.path.join(dir_name, str(count) + '.txt')
        data = [
            ','.join(x[0]) + ' | ' + ','.join(x[1]) + ' | ' + x[2]
            for x in review.samples
        ]
        helper.save_list_to_file(target_path, data)

        count += 1
Example #14
def save_lfmb_c1ku_combined_file(c1ku_file, lfmb1_file, output_file,
                                 header_string):
    helper.log_highlight('save ' + output_file)

    LFM1b_file = mf.read_txt(lfmb1_file)
    sorted_string = header_string + "\n"

    with open(c1ku_file, 'r') as f:
        reader = csv.reader(f, delimiter='\t')  # create reader
        headers = reader.next()  # skip header

        for index, row in enumerate(reader, start=1):
            the_id = row[0]

            sorted_string += LFM1b_file[the_id] + "\n"

    helper.ensure_dir(OUTPUT_DIR)

    text_file = open(output_file, 'w')

    text_file.write(sorted_string)
    text_file.close()
Example #15
def fix_station_data(stations, ref_station_data):

    for station_idx, station_id in enumerate(stations.index):
        station_data = read_station_data(station_id)

        print(station_idx, station_id)
        if path.isfile(
                path.join(BASE_DIR, "pems", "fix",
                          "{}.csv".format(station_id))):
            continue

        if station_data["Flow (Veh/5 Minutes)"].dtype == 'object':
            station_data["Flow (Veh/5 Minutes)"] = station_data[
                "Flow (Veh/5 Minutes)"].map(
                    lambda x: float(x.replace(",", "")))

        if "Speed (mph)" not in station_data.columns:
            station_data["Speed (mph)"] = pd.Series(np.zeros(
                station_data.index.size),
                                                    index=station_data.index)
            station_data = station_data[[
                "Flow (Veh/5 Minutes)", "Speed (mph)", "# Lane Points",
                "% Observed"
            ]]

        if station_data.shape[0] != ref_station_data.shape[0]:
            station_data = station_data.reindex_like(ref_station_data,
                                                     method='ffill')

        if pd.isnull(station_data).any().any():
            print(station_id)
            raise Exception("bad stations")

        station_data = resample_dataframe(station_data)

        station_data = station_data.apply(pd.to_numeric)
        station_data.to_csv(ensure_dir(
            path.join(BASE_DIR, "pems", "fix", "{}.csv".format(station_id))),
                            date_format="%Y-%m-%d %H:%M:%S")
Example #16
def generate_wikipedia_AAM():
        ps = PorterStemmer()
        html_contents = {}
        # dictionary to hold document frequency of each term in corpus
        terms_df = {}
        # list of all terms
        term_list = []

        # read artist names from file
        artists = Wikipedia_Fetcher.read_file(Wikipedia_Fetcher.ARTISTS_FILE)   # using functions and parameters defined in o1_Wikipedia_Fetcher.py

        helper.ensure_dir(WIKIPEDIA_OUTPUT)

        # for all artists
        for i in range(0, len(artists)):
            # construct file name to fetched HTML page for current artist, depending on parameter settings in Wikipedia_Fetcher.py
            if Wikipedia_Fetcher.USE_INDEX_IN_OUTPUT_FILE:
                html_fn = Wikipedia_Fetcher.OUTPUT_DIRECTORY + "/" + str(i) + ".html"     # target file name
            elif not Wikipedia_Fetcher.USE_INDEX_IN_OUTPUT_FILE:
                html_fn = Wikipedia_Fetcher.OUTPUT_DIRECTORY + "/" + urllib.quote(artists[i]) + ".html"     # target file name

            # Load fetched HTML content if target file exists
            if os.path.exists(html_fn):
                # Read entire file
                html_content = open(html_fn, 'r').read()

                # Next we perform some text processing:
                # Strip content off HTML tags
                content_tags_removed = remove_html_markup(html_content)
                # remove numbers
                content_no_numbers = re.sub(r'[0-9]+', ' ', content_tags_removed)
                # Perform case-folding, i.e., convert to lower case
                content_casefolded = content_no_numbers.lower()
                # remove words with wiki in it
                content_no_specific_words = re.sub(r'[\w]*wiki|article|pedia|privacy|policy[\w]*', ' ', content_casefolded)
                # Tokenize stripped content at white space characters
                tokens = content_no_specific_words.split()
                # Remove all tokens containing non-alphanumeric characters; using a simple lambda function (i.e., anonymous function, can be used as parameter to other function)
                tokens_filtered = filter(lambda t: t.isalnum(), tokens)
                # Remove words in the stop word list
                tokens_filtered_stopped = filter(lambda t: t not in STOP_WORDS, tokens_filtered)

                tokens_stemmed = []
                # stemm words
                for w in tokens_filtered_stopped:
                    tokens_stemmed.append(ps.stem(w))

                # Store remaining tokens of current artist in dictionary for further processing
                if len(tokens_stemmed) > 0:
                    html_contents[i] = tokens_filtered_stopped

                print "File " + html_fn + " --- total tokens: " + str(len(tokens)) + "; after filtering and stopping: " + str(len(tokens_filtered_stopped))
            else:           # Inform user if target file does not exist
                print "Target file " + html_fn + " does not exist!"
                html_contents[i] = ''

        # Start computing term weights, in particular, document frequencies and term frequencies.

        # Iterate over all (key, value) tuples from dictionary just created to determine document frequency (DF) of all terms
        for aid, terms in html_contents.items():
            # convert list of terms to set of terms ("uniquify" words for each artist/document)
            for t in set(terms):                         # and iterate over all terms in this set
                # update number of artists/documents in which current term t occurs
                if t not in terms_df:
                    terms_df[t] = 1
                else:
                    terms_df[t] += 1

        # remove all values which are one
        #terms_df = dict((k, v) for k, v in terms_df.iteritems() if v != 1)

        # Compute number of artists/documents and terms
        no_artists = len(html_contents.items())
        no_terms = len(terms_df)

        print "Number of artists in corpus: " + str(no_artists)
        print "Number of terms in corpus: " + str(no_terms)

        # You may want (or need) to perform some kind of dimensionality reduction here, e.g., filtering all terms
        # with a very small document frequency.
        # ...

        # Dictionary is unordered, so we store all terms in a list to fix their order, before computing the TF-IDF matrix
        for t in terms_df.keys():
            term_list.append(t)

        # Create IDF vector using logarithmic IDF formulation
        idf = np.zeros(no_terms, dtype=np.float32)
        for i in range(0, no_terms):
            idf[i] = np.log(no_artists / terms_df[term_list[i]])
    #        print term_list[i] + ": " + str(idf[i])

        # Initialize matrix to hold term frequencies (and eventually TF-IDF weights) for all artists for which we fetched HTML content
        tfidf = np.zeros(shape=(no_artists, no_terms), dtype=np.float32)

        # Iterate over all (artist, terms) tuples to determine all term frequencies TF_{artist,term}
        terms_index_lookup = {}         # lookup table for indices (for higher efficiency)
        for a_idx, terms in html_contents.items():
            print "Computing term weights for artist " + str(a_idx)
            # You may want (or need) to make the following more efficient.
            for t in terms:                     # iterate over all terms of current artist
                if t in terms_index_lookup:
                    t_idx = terms_index_lookup[t]
                else:
                    t_idx = term_list.index(t)      # get index of term t in (ordered) list of terms
                    terms_index_lookup[t] = t_idx
                tfidf[a_idx, t_idx] += 1        # increase TF value for every encounter of a term t within a document of the current artist

        # Replace TF values in tfidf by TF-IDF values:
        # copy and reshape IDF vector and point-wise multiply it with the TF values
        tfidf = np.log1p(tfidf) * np.tile(idf, no_artists).reshape(no_artists, no_terms)

        # Storing TF-IDF weights and term list
        print "Saving TF-IDF matrix to " + WIKIPEDIA_TFIDFS + "."
        np.savetxt(WIKIPEDIA_TFIDFS, tfidf, fmt='%0.6f', delimiter='\t', newline='\n')

        print "Saving term list to " + WIKIPEDIA_TERMS + "."
        with open(WIKIPEDIA_TERMS, 'w') as f:
            f.write("terms\n")

            for t in term_list:
                f.write(t + "\n")

        # Computing cosine similarities and store them
    #    print "Computing cosine similarities between artists."
        # Initialize similarity matrix
        sims = np.zeros(shape=(no_artists, no_artists), dtype=np.float32)
        # Compute pairwise similarities between artists
        for i in range(0, no_artists):
            print "Computing similarities for artist " + str(i)
            for j in range(i, no_artists):
                cossim = 1.0 - scidist.cosine(tfidf[i], tfidf[j])

                # If either TF-IDF vector (of i or j) only contains zeros, cosine similarity is not defined (NaN: not a number).
                # In this case, similarity between i and j is set to zero (or left at zero, in our case).
                if not np.isnan(cossim):
                    sims[i,j] = cossim
                    sims[j,i] = cossim

        print "Saving cosine similarities to " + WIKIPEDIA_AAM + "."
        np.savetxt(WIKIPEDIA_AAM, sims, fmt='%0.6f', delimiter='\t', newline='\n')

        # Compute number of artists/documents and terms
        no_artists = len(html_contents.items())
        no_terms = len(terms_df)

        print "Number of artists in corpus: " + str(no_artists)
        print "Number of terms in corpus: " + str(no_terms)
Example #17
]
DATE_START = DT.datetime.strptime(
    "{} 08:00:00".format(DT.datetime.now().strftime('%Y-%m-%d')),
    '%Y-%m-%d %H:%M:%S')
DATE_END = DATE_START - DT.timedelta(days=7)
LIMIT = 2000


def get_data(container, time):
    df, time_to = fetch_data_by_exchange(FSYM,
                                         TSYM,
                                         market,
                                         time,
                                         time_frame="minute")
    container.append(df)
    return df.shape[0], time_to


for market in MARKETS:
    print("{}".format(market), end="")
    dfs = []

    num_row, time = get_data(dfs, T.mktime(DATE_START.timetuple()))
    while num_row > LIMIT:
        num_row, time = get_data(dfs, time)

    data = pd.concat(dfs).sort_index().drop_duplicates("time")
    data.to_csv(ensure_dir(path_join("./data",
                                     "{}_minute.csv".format(market))))
    print("\tdownloaded")
Example #18
    no_users = UAM.shape[0]
    no_artists = UAM.shape[1]
    # np.tile: take sum_pc_user no_artists times (results in an array of length no_artists*no_users)
    # np.reshape: reshape the array to a matrix
    # np.transpose: transpose the reshaped matrix
    artist_sum_copy = np.tile(sum_pc_user,
                              no_artists).reshape(no_artists,
                                                  no_users).transpose()
    # Perform sum-to-1 normalization
    UAM = UAM / artist_sum_copy

    # Inform user
    print "UAM created. Users: " + str(UAM.shape[0]) + ", Artists: " + str(
        UAM.shape[1])

    helper.ensure_dir(OUTPUT_DIR)

    # Write everything to text file (artist names, user names, UAM)
    # Write artists to text file
    with open(ARTISTS_FILE, 'w') as outfile:  # "a" to append
        outfile.write('artist\n')
        for key in artists.keys():  # for all artists listened to by any user
            outfile.write(key + "\n")

    # Write users to text file
    with open(USERS_FILE, 'w') as outfile:
        outfile.write('user\n')
        for key in users.keys():  # for all users
            outfile.write(key + "\n")

    # Write UAM
Example #19
def run_recommender(run_function, run_method, neighbors=[1, 2, 5, 10, 20, 50], recommender_artists=[1, 3, 5, 7, 10, 20, 30, 50, 100, 200]):
    """
    runs automatically the run function, this funciton must be declared in the parameters
    it also saves automatically a json string with the parameters - the file name is as follows:
    K(K_number)_R(Recommended_artists).json

    :param run_function: the run fuction, from the single recommender
    :param run_method: the string which describes the current recommender
    :param neighbors: a list of different neighbors
    :param recommender_artists: a list of different artists to recommend
    """
    # for threading
    global NUM_THREADS, THREAD_STARTED, LOCK

    LOCK.acquire()
    NUM_THREADS += 1
    THREAD_STARTED = True
    LOCK.release()
    # for threading

    k_sorted       = {}
    r_sorted       = {}
    data_to_append = {}
    all_files      = {}
    output_filedir = OUTPUT_DIR + run_method + '/'
    all_files_path = output_filedir + 'all.json'

    helper.ensure_dir(output_filedir + 'recommended/')

    for neighbor in neighbors:
        k_sorted['K' + str(neighbor)] = []

        for recommender_artist in recommender_artists:
            k_sorted['R' + str(recommender_artist)] = []
            file_path       = output_filedir + 'K' + str(neighbor) + '_R' + str(recommender_artist) + '.json'
            file_path_reco  = output_filedir + 'recommended/' + 'K' + str(neighbor) + '_R' + str(recommender_artist) + '.json'
            data_to_append  = {'neighbors': neighbor, 'recommended_artists': recommender_artist}
            data            = run_function(neighbor, recommender_artist)
            recommended     = data['recommended']
            formated_recommended = {}

            # delete this key:
            # 1. it is not valid JSON as-is
            # 2. it is not needed in these per-run files
            del data['recommended']

            data_to_append.update(data)

            if type(recommended) is not bool:
                for key, value in recommended.iteritems():
                    # convert everything to strings,
                    # otherwise the result is not valid JSON
                    formated_recommended[key] = {}

                    if len(value) == 0:
                        continue

                    for kf, fold_recommended in value.iteritems():
                        formated_recommended[key][kf] = {}
                        formated_recommended[key][kf]['recommended'] = {}
                        formated_recommended[key][kf]['order'] = []

                        for artist, ranking in fold_recommended.iteritems():
                            formated_recommended[key][kf]['recommended'][str(artist)] = str(ranking)
                            formated_recommended[key][kf]['order'].append(artist)

                # write json file for hybrids
                content = json.dumps(formated_recommended, indent=4, sort_keys=True)
                f = open(file_path_reco, 'w')
                f.write(content)
                f.close()

            # write json file for csv
            content = json.dumps(data_to_append, indent=4, sort_keys=True)
            f = open(file_path, 'w')
            f.write(content)
            f.close()

    # for threading
    LOCK.acquire()
    NUM_THREADS -= 1
    LOCK.release()
Example #20
def work(rank, args, master_net, cc, optimizer=None):
    torch.manual_seed(args.seed + rank)

    summary_file = path_join(args.model_path, EXP_NAME + "_{}".format(rank))
    summary = cc.create_experiment(helper.ensure_dir(summary_file))
    summary.to_zip(summary_file)

    exp_buff = helper.ExperienceBuffer()
    episodes = master_net.episodes
    episode_deliveries = []
    episode_lengths = []
    # episode_mean_values = []

    # Create the local copy of the network
    env = gameEnv(args.partial, args.env_size, args.action_space)
    local_net = DFP_Network(
        (args.env_size**2) *
        3,  # observation_size = (args.env_size*args.env_size)*3 = battle_ground*colors
        num_offset=len(args.offset),
        a_size=args.action_space,
        num_measurements=args.num_measurements)
    assert args.num_measurements == len(env.measurements)

    if optimizer is None:
        optimizer = optim.Adam(master_net.parameters(), lr=args.learning_rate)

    print("Starting work on worker-{}".format(rank))

    while not master_net.should_stop():
        local_net.load_state_dict(master_net.state_dict(
        ))  # Copy parameters from global to local network
        episode_buffer = []
        episode_frames = []
        done = False
        step = 0
        temp = 0.25  # How spread out we want our action distribution to be

        observation, o_big, measurements, delivery_pos, drone_pos = env.reset()
        the_measurements = measurements  # measurements: [number of deliveries, battery life]
        while not done:

            # Here is where our goal-switching takes place
            # When the battery charge is below 0.3, we set the goal to optimize battery
            # When the charge is above that value we set the goal to optimize deliveries
            if measurements[1] <= .3:
                goal = np.array([[0., 1.]])
            else:
                goal = np.array([[1., 0.]
                                 ])  # goal [go for delivery, go for battery]

            action_dist = local_net.forward(np.expand_dims(observation, 0),
                                            np.expand_dims(measurements, 0),
                                            goal, temp)

            b = np.squeeze(goal, axis=0) * np.squeeze(action_dist.data.numpy(),
                                                      axis=0).T
            c = np.sum(b, axis=1)
            c /= c.sum()

            # Sample an action from the goal-weighted action distribution
            action = np.random.choice(c, p=c)
            action = np.argmax(c == action)

            observation_new, o_new_big, measurements_new, delivery_pos_new, drone_pos_new, done = env.step(
                action)
            episode_buffer.append([
                observation, action,
                np.array(measurements), goal,
                np.zeros(len(args.offset))
            ])

            if rank == 0 and master_net.episodes % 150 == 0:
                episode_frames.append(
                    helper.set_image_gridworld(o_new_big, measurements_new,
                                               step + 1, delivery_pos_new,
                                               drone_pos_new))

            observation = np.copy(observation_new)
            measurements = measurements_new[:]
            delivery_pos = delivery_pos_new[:]
            drone_pos = drone_pos_new
            step += 1

            # End the episode after 100 steps
            if step > 100:
                done = True

        episode_deliveries.append(measurements[0])
        episode_lengths.append(step)

        # Update the network using the experience buffer at the end of the episode.
        if args.train:
            loss, entropy = train(episode_buffer,
                                  exp_buff,
                                  local_net=local_net,
                                  master_net=master_net,
                                  action_space=args.action_space,
                                  offsets=args.offset,
                                  optimizer=optimizer,
                                  batch_size=args.batch_size,
                                  max_grad_norm=args.max_grad_norm)

        # Periodically save gifs of episodes, model parameters, and summary statistics.
        if episodes % 50 == 0 and episodes != 0:
            if master_net.episodes % 2000 == 0 and rank == 0 and args.train:  # args.train, not the module-level train() function
                model_file = path_join(args.model_path,
                                       'model-{}.cptk'.format(episodes))
                torch.save(master_net.state_dict(),
                           helper.ensure_dir(model_file))
                print("Saved Model")

            if rank == 0 and master_net.episodes % 150 == 0:
                time_per_step = 0.25
                images = np.array(episode_frames)
                image_file = path_join(args.gif_path +
                                       '/image-{}.gif'.format(episodes))
                imageio.mimsave(helper.ensure_dir(image_file),
                                images,
                                duration=time_per_step)

            mean_deliveries = np.mean(episode_deliveries[-50:])
            mean_length = np.mean(episode_lengths[-50:])
            # mean_value = np.mean(episode_mean_values[-50:])

            summary.add_scalar_value('Performance/Deliveries_{}'.format(rank),
                                     float(mean_deliveries))
            summary.add_scalar_value('Performance/Length_{}'.format(rank),
                                     float(mean_length))
            # summary.add_scalar_value('Performance/Mean-{}'.format(rank), float(mean_value))
            summary.add_scalar_value('Check/episode_{}'.format(rank), episodes)
            summary.add_scalar_value('Check/master_episode_{}'.format(rank),
                                     master_net.episodes)

            if args.train:
                summary.add_scalar_value('Losses/Loss_{}'.format(rank),
                                         float(loss.data.numpy()))
                summary.add_scalar_value('Losses/Entory_{}'.format(rank),
                                         float(entropy.data.numpy()))
            summary.to_zip(summary_file)
        episodes += 1
        master_net.episodes += 1
Example #21
def plot(experiment, output_dir="evaluation/multi_scenario", input_dir="results/"):
    # setup directories for this plot
    input_dir = os.path.join(experiment, input_dir)
    output_dir = os.path.join(experiment, output_dir)
    ensure_dir(output_dir, rm=True)
    print input_dir
    print output_dir

    # load data
    ed = data.ExperimentData(path=input_dir)
    ed.normalize_times()
    df = ed.get_combined_df()

    cdelays = sorted(df["controldelay"].drop_duplicates().tolist())
    print cdelays

    lambdas = sorted(df["srclambda"].drop_duplicates().tolist())
    print lambdas

    middleboxes = sorted(df["numbermb"].drop_duplicates().tolist())
    # middleboxes.remove(16)
    print middleboxes

    dummystatesizes = sorted(df["dummystatesize"].drop_duplicates().tolist())
    print dummystatesizes

    """
    Plots:
    xaxis = numbermb
    yaxis = pps, request times, global pcount
    layout: one plot line per backend
    """
    for delay in cdelays[:2]:
        for lmb in lambdas:
            for dss in dummystatesizes:
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="numbermb",
                    yfield=["pps_global", "pps_local"],
                    destinction_field="backend",
                    rowfilter={
                        "controldelay": delay,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="number of replicated VNF instances",
                    yname="avg. processed pkt/s",
                    name_pre="",
                    name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss)
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="numbermb",
                    yfield=["pps_global"],
                    destinction_field="backend",
                    rowfilter={
                        "controldelay": delay,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="number of replicated VNF instances",
                    yname="avg. processed pkt/s",
                    name_pre="",
                    name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss)
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="numbermb",
                    yfield=["t_request_global", "t_request_local"],
                    destinction_field="backend",
                    rowfilter={
                        "controldelay": delay,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="number of replicated VNF instances",
                    yname="avg. state request delay [s]",
                    name_pre="",
                    name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss)
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="numbermb",
                    yfield=["t_request_global"],
                    destinction_field="backend",
                    rowfilter={
                        "controldelay": delay,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="number of replicated VNF instances",
                    yname="avg. state request delay [s]",
                    name_pre="",
                    name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss),
                    ymax=0.9
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="numbermb",
                    yfield=["pcount_global", "pcount_local"],
                    destinction_field="backend",
                    rowfilter={
                        "controldelay": delay,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="number of replicated VNF instances",
                    yname="number of processed packets",
                    name_pre="",
                    name_post="_d%03d_l%03d_dss%08d" % (delay, lmb*100, dss)
                    )

    """
    Plots:
    xaxis = controldelay
    yaxis = pps, request times, global pcount
    layout: one plot line per backend
    """
    for nmb in middleboxes:
        for lmb in lambdas:
            for dss in dummystatesizes[:2]:
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="controldelay",
                    yfield=["pps_global", "pps_local"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="control plane latency [ms]",
                    yname="avg. processed pkt/s",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss)
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="controldelay",
                    yfield=["pps_global"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="control plane latency [ms]",
                    yname="avg. processed pkt/s",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss)
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="controldelay",
                    yfield=["t_request_global", "t_request_local"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="control plane latency [ms]",
                    yname="avg. state request delay [s]",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss)
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="controldelay",
                    yfield=["t_request_global"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="control plane latency [ms]",
                    yname="avg. state request delay [s]",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss)
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="controldelay",
                    yfield=["pcount_global"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "dummystatesize": dss},
                    xname="control plane latency [ms]",
                    yname="number of processed packets",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_dss%08d" % (nmb, lmb*100, dss)
                    )

    """
    Plots:
    xaxis = dummystatesize
    yaxis = pps, request times, global pcount
    layout: one plot line per backend
    """
    for nmb in middleboxes:
        for lmb in lambdas:
            for delay in cdelays[:2]:
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="dummystatesize",
                    yfield=["pps_global"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "controldelay": delay},
                    xname="state size [byte]",
                    yname="avg. processed pkt/s",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_d%03d" % (nmb, lmb*100, delay),
                    xlogscale=True
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="dummystatesize",
                    yfield=["t_request_global", "t_request_local"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "controldelay": delay},
                    xname="state item size [byte]",
                    yname="avg. processed request delay [s]",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_d%03d" % (nmb, lmb*100, delay),
                    xlogscale=True
                    )
                multi_scenario_plot(
                    output_dir,
                    ed,
                    xfield="dummystatesize",
                    yfield=["t_request_global"],
                    destinction_field="backend",
                    rowfilter={
                        "numbermb": nmb,
                        "srclambda": lmb,
                        "controldelay": delay},
                    xname="state item size [byte]",
                    yname="avg. state request delay [s]",
                    name_pre="",
                    name_post="_nmb%03d_l%03d_d%03d" % (nmb, lmb*100, delay),
                    xlogscale=True
                    )
Example #22
def process_dir(data_dir, MIN_N_GRAM, MAX_N_GRAM, b_verbose=False, b_size=None):
    """
    Processes a directory containing a set of case documents and generates n-grams.
    The generated n-grams are stored in {data_dir}/n_grams/
    """

    target_dir = os.path.join(data_dir, 'n_grams')

    # Make sure the target directory exists
    helper.ensure_dir(target_dir)

    # Get the case file list
    case_files = helper.get_files(data_dir)

    if b_size is not None:
        case_files = case_files[:b_size]

    total_count = len(case_files)
    progress = 0

    for case_file in case_files:

        # Compute the path to save the file
        target_file_name = os.path.basename(case_file)
        target_path = os.path.join(target_dir, target_file_name)

        # Read the case data from the string
        case_data = helper.read_file_to_string(case_file)

        valid_n_grams = {}

        # Go over every sentence in the document
        for sentence in get_sentences(case_data):

            pos_tuples = nltk.pos_tag(nltk.word_tokenize(sentence))

            # Update the grammar if required and get the POS tags
            pos_tags = get_pos_tags(pos_tuples)

            # Generate N-Grams of tags
            n_grams = []
            for n in range(MIN_N_GRAM, MAX_N_GRAM + 1):
                n_grams.extend([list(grams) for grams in ngrams(range(len(pos_tuples)), n)])

            # Get only the n-grams that match the defined grammar
            for i in range(len(n_grams)):

                # Generate n-gram list and check validity
                if parse([pos_tags[j] for j in n_grams[i]]):

                    # Append words to overall list
                    elements = ' '.join([pos_tuples[k][0] for k in n_grams[i]])

                    if elements in valid_n_grams:
                        valid_n_grams[elements] += 1
                    else:
                        valid_n_grams[elements] = 1

        # Save n-grams to file
        helper.save_dict_to_file(target_path, valid_n_grams)

        progress += 1

        if b_verbose:
            print(progress / (0.01 * total_count), ' % Complete')

    return target_dir
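
The n-grams above are generated over token indices rather than the tokens themselves, so every candidate can be mapped back to both its POS tags and its words. A small illustration of that indexing trick (nltk.util.ngrams is the same function used in the loop above; the sentence is made up):

from nltk.util import ngrams

pos_tuples = [('solar', 'JJ'), ('energy', 'NN'), ('systems', 'NNS')]
index_bigrams = [list(g) for g in ngrams(range(len(pos_tuples)), 2)]
print(index_bigrams)                                         # [[0, 1], [1, 2]]
print(' '.join(pos_tuples[k][0] for k in index_bigrams[0]))  # solar energy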
Example #23
            if i_iter % args.eval_step == 0:
                iter_eval, saved_weights = eval(eval_dataloader, input_embeddings, target_embeddings, neighbor_embeddings, edge_types, mask_neighbor, device)
                eval_loss.append(iter_eval)
                vis.line(
                    Y=np.array(eval_loss),
                    X=np.array(range(0, i_iter + 1, args.eval_step)),
                    opts=dict(legend=["RMSE"],
                              title=model.name + " eval loos",
                              showlegend=True),
                    win="win:eval-{}".format(EXP_NAME))

                # print("dump example")
                # torch.save(saved_weights, ensure_dir(path.join(path.join("..", "data", args.data_dir), model.name, "saved_eval_iter_{}_drop_{}.pt".format(int(i_iter/args.eval_step), args.drop_prob))))
                # print("dump done")

                if best_model > iter_eval:
                    print("save best model")
                    best_model = iter_eval
                    torch.save(model, path.join(path.join("..", "data", args.data_dir), "{}.pt".format(model.name)))


        # test performance
        model = torch.load(path.join(path.join("..", "data", args.data_dir), "{}.pt".format(model.name)))

        test = setup_model(model, args.eval_batch_size, args, is_training=False)
        iter_test, saved_weights = test(test_dataloader, input_embeddings, target_embeddings, neighbor_embeddings, edge_types, mask_neighbor, device)
        print("test RMSE: {}".format(iter_test))
        torch.save(saved_weights, ensure_dir(
            path.join(path.join("..", "data", args.data_dir), model.name, "saved_test_drop_{}.pt".format(args.drop_prob))))
        test_rmse.append(iter_test)
    print("execution_mean: {}".format(np.mean(test_rmse)))
Example #24
    stats['len_limited'] = len_limited
    stats['best_five'] = terms_df[:5]
    stats['all'] = terms_df
    stats['found_artists'] = found_artists

    return stats


# /count_wiki_terms

if __name__ == '__main__':
    loop_me = {}
    #loop_me['wiki'] = count_wiki_terms()
    loop_me['mm'] = count_mm_terms()

    helper.ensure_dir(OUTPUT)

    for key, terms in loop_me.iteritems():
        filename = 'novalue.json'

        if key == 'wiki':
            filename = 'wiki_term_stats.json'

        elif key == 'mm':
            filename = 'mm_term_stats.json'

        content = json.dumps(terms, indent=4, sort_keys=True)
        json_file = open(OUTPUT + filename, 'w')  # use the filename selected above, not a hard-coded name

        json_file.write(content)
        json_file.close()
Example #25
def download_traffic(stations_id,
                     auth_data,
                     auth_base='http://pems.dot.ca.gov/',
                     time_interval=[(1522627200, 1523059200),
                                    (1523232000, 1523664000),
                                    (1523836800, 1524268800),
                                    (1524441600, 1524873600)]):
    base_url = "http://pems.dot.ca.gov/?report_form=1&dnode=VDS&content=loops&tab=det_timeseries&export=text&station_id={}&s_time_id={}&e_time_id={}&tod=all&tod_from=0&tod_to=0&dow_1=on&dow_2=on&dow_3=on&dow_4=on&dow_5=on&q=flow&q2=speed&gn=5min&agg=on"
    with session() as c:
        c.post(auth_base, data=auth_data)

        for i, station_id in enumerate(stations_id):
            ts = 10  # Default time to sleep
            print("Iteration: {}".format(i))
            print('initial time to sleep {}'.format(ts))
            for j, (start_time, end_time) in enumerate(time_interval):
                url = base_url.format(station_id, start_time, end_time)
                ts_small = 2  # small sleep interval
                while True:
                    try:  # Download with 10-second sleep time breaks
                        print('try to download file: {}-{}'.format(
                            station_id, j))
                        print('time to sleep {}'.format(ts_small))
                        # Make the request and download attached file
                        r = c.get(url)
                        if r.status_code == 200:
                            with open(
                                    ensure_dir(
                                        path.join(BASE_DIR,
                                                  "{}".format(station_id),
                                                  "part-{}.csv".format(j))),
                                    "w") as file:
                                file.write(r.text)
                        else:
                            raise ConnectionError("Data not obtained")
                        # save file
                        time.sleep(
                            np.random.random_integers(ts_small,
                                                      int(1.2 * ts_small)))
                    except ConnectionError:
                        print('ConnectionError')
                        ts_small = ts_small * 2
                        time.sleep(ts)  # Sleep and login again
                        c.post(auth_base, data=auth_data)
                        continue
                    break
            # sleep for a longer interval
            dt = [
                pd.read_csv(path.join(BASE_DIR, "{}".format(station_id),
                                      "part-{}.csv".format(i)),
                            sep="\t") for i in range(len(time_interval))
            ]
            dt = pd.concat(dt, axis=0)
            dt["5 Minutes"] = pd.to_datetime(dt["5 Minutes"],
                                             format="%m/%d/%Y %H:%M")
            dt = dt.set_index("5 Minutes")
            dt.to_csv(
                ensure_dir(
                    path.join(BASE_DIR, "stations",
                              "{}.csv".format(station_id))))
            time.sleep(np.random.random_integers(ts, int(1.2 * ts)))
Example #26
        _ids.remove(s_idx)

    eval_dataset = random.sample(_ids, e_t_size)
    for s_idx in eval_dataset:
        _ids.remove(s_idx)

    return _ids, eval_dataset, test_dataset


if __name__ == "__main__":
    input_embeddings, target_embeddings, neighbor_embeddings, edge_type, mask_neigh, prefix = generate_triangular_embedding(
        (12000, 10), 4)

    torch.save(
        input_embeddings,
        ensure_dir(path.join(BASE_DIR, prefix + "_input_embeddings.pt")))
    torch.save(
        target_embeddings,
        ensure_dir(path.join(BASE_DIR, prefix + "_target_embeddings.pt")))
    torch.save(
        neighbor_embeddings,
        ensure_dir(path.join(BASE_DIR, prefix + "_neighbor_embeddings.pt")))
    torch.save(edge_type,
               ensure_dir(path.join(BASE_DIR, prefix + "_edge_type.pt")))
    torch.save(mask_neigh,
               ensure_dir(path.join(BASE_DIR, prefix + "_mask_neighbor.pt")))

    train_dataset, eval_dataset, test_dataset = split_training_test_dataset(
        list(range(input_embeddings.size(0))), e_t_size=1000)
    torch.save(train_dataset, path.join(BASE_DIR,
                                        prefix + "_train_dataset.pt"))
Example #27
    artists           = artists[:NUMBER_OF_MAX_ARTISTS]
    number_of_fetches = NUMBER_OF_MAX_ARTISTS * 2 + (NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS) * (1 + NUMBER_OF_MAX_TRACKS)

    if VERBOSE:
        helper.log_highlight('You will have ' + str(number_of_fetches) + ' queries to the musixmatch api')
        print ''
        print 'Artist queries: ' + str(NUMBER_OF_MAX_ARTISTS)
        print 'Album queries:  ' +  str(NUMBER_OF_MAX_ARTISTS)
        print 'Track queries:  ' + str(NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS)
        print 'Lyrics queries: ' + str((NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS) * NUMBER_OF_MAX_TRACKS)
        print ''
        print 'These numbers can vary if an artist has fewer albums, tracks or tracks with lyrics'
        print ''

    helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH)

    # live fetching
    # fetched_artist_ids = get_artist_ids(artists)
    # save_txt(fetched_artist_ids, 'artist_ids.txt')

    # fetched_artist_album_ids = get_artist_albums(fetched_artist_ids, NUMBER_OF_ALBUMS)
    # save_txt(fetched_artist_album_ids, 'album_ids.txt')

    # fetched_artist_album_tracks = get_artist_album_tracks(fetched_artist_album_ids, NUMBER_OF_MAX_TRACKS)
    # save_txt(fetched_artist_album_tracks, 'album_tracks.txt')
    # fetched_lyrics              = get_lyrics_by_tracks(fetched_artist_album_tracks)

    # fetching with stored data
    # fetched_artist_ids          = read_txt(GENERATED_ARTISTS_FILE)
    # fetched_artist_album_ids    = read_txt(GENERATED_ALBUM_IDS_FILE, True)
    for excange in dic["Data"]["Exchanges"]
]
market = sorted(market, key=lambda d: -d.volume)
for excange_info in market:
    print("{}\t{}".format(excange_info.market, excange_info.volume))

# download daily OHLC price-series for ETH/USD for a given 'market'
# extract close-price (cp)
print("{}/{}".format(fsym, tsym))
good_market_name = []

data = pd.DataFrame()
for market in map(lambda m: m.market, market):
    print("{}".format(market), end="")
    df = fetch_data_hour_by_exchange(fsym, tsym, market)
    df = df[(df.index > "2017-06-01") & (df.index <= "2017-11-05")]
    if df.shape[0] != 0:
        df.name = market
        df.to_csv(
            ensure_dir(path_join("./data", "{}_hourly.csv".format(market))))
        data = pd.concat([data, df], axis=1, ignore_index=False)
        print("\tdownloaded")
        good_market_name.append(market)
        if len(good_market_name) == 10:
            break
    else:
        print("\tskipped")

print(good_market_name)
print(data.head(10))
print(data.tail(10))
    for _id in eval_stations:
        eval_dataset.extend(site_to_exp_idx.d[_id])

    for _id in test_stations:
        test_dataset.extend(site_to_exp_idx.d[_id])

    print("train len: {}\neval len: {}\ntest len: {}".format(len(train_dataset), len(eval_dataset), len(test_dataset)))

    return train_dataset, eval_dataset, test_dataset



if __name__ == "__main__":
    stations = read_stations()
    G, stations_distances = compute_graph(stations)
    torch.save(G, ensure_dir(path.join(BASE_DIR, "pems", "temp", "graph.pt")))
    torch.save(stations_distances, ensure_dir(path.join(BASE_DIR, "pems", "temp", "station_dist.pt")))

    # G = torch.load(path.join(BASE_DIR, "pems", "temp", "graph.pt"))
    #
    # input_embeddings, target_embeddings, neighbor_embeddings, edge_type, neigh_mask, station_id_to_idx, station_id_to_exp_idx = generate_embedding(stations, G)
    #
    # torch.save(input_embeddings, ensure_dir(path.join(BASE_DIR, "pems", "utility_input_embeddings.pt")))
    # torch.save(target_embeddings, ensure_dir(path.join(BASE_DIR, "pems", "target_embeddings.pt")))
    # torch.save(neighbor_embeddings, ensure_dir(path.join(BASE_DIR, "pems", "neighbor_embeddings.pt")))
    # torch.save(edge_type, ensure_dir(path.join(BASE_DIR, "pems", "edge_type.pt")))
    # torch.save(neigh_mask, ensure_dir(path.join(BASE_DIR, "pems", "mask_neighbor.pt")))
    # torch.save(station_id_to_idx, ensure_dir(path.join(BASE_DIR, "pems", "station_id_to_idx.pt")))
    # torch.save(station_id_to_exp_idx, ensure_dir(path.join(BASE_DIR, "pems", "station_id_to_exp_idx.pt")))
    #
    # station_id_to_idx = torch.load(path.join(BASE_DIR, "pems", "station_id_to_idx.pt"))
    sites_correlation = pickle.load(
        open(path.join(BASE_DIR, "utility", "temp", "neighbors.bin"), "rb"))
    tz_onehot = pickle.load(
        open(path.join(BASE_DIR, "utility", "temp", "tz_onehot.bin"), "rb"))

    input_embeddings, target_embeddings, neighbor_embeddings, edge_types, neigh_mask, site_to_idx, site_to_exp_idx = generate_embedding(
        sites_normalized_dataframe,
        sites_info,
        sites_correlation,
        days_onehot,
        tz_onehot,
        seq_len=16)

    torch.save(
        input_embeddings,
        ensure_dir(
            path.join(BASE_DIR, "utility", "utility_input_embeddings.pt")))
    torch.save(
        target_embeddings,
        ensure_dir(path.join(BASE_DIR, "utility", "target_embeddings.pt")))
    torch.save(
        neighbor_embeddings,
        ensure_dir(path.join(BASE_DIR, "utility", "neighbor_embeddings.pt")))
    torch.save(edge_types,
               ensure_dir(path.join(BASE_DIR, "utility", "edge_type.pt")))
    torch.save(neigh_mask,
               ensure_dir(path.join(BASE_DIR, "utility", "mask_neighbor.pt")))
    torch.save(site_to_idx,
               ensure_dir(path.join(BASE_DIR, "utility", "site_to_idx.pt")))
    torch.save(
        site_to_exp_idx,
        ensure_dir(path.join(BASE_DIR, "utility", "site_to_exp_idx.pt")))
Example #31
                        eval_dataloader, input_embeddings, target_embeddings,
                        neighbor_embeddings, edge_types, mask_neighbor, device)
                    eval_loss.append(iter_eval)

                    vis.line(Y=np.array(eval_loss),
                             X=np.array(range(0, i_iter + 1, args.eval_step)),
                             opts=dict(legend=["RMSE"],
                                       title=model.name + " eval loss",
                                       showlegend=True),
                             win="win:eval-{}".format(EXP_NAME))

                    torch.save(
                        saved_weights,
                        ensure_dir(
                            path.join(
                                "data", args.data_dir, model.name,
                                "{}_new_saved_eval_iter-{}_temp-{}.bin".format(
                                    args.dataset_prefix,
                                    int(i_iter / args.eval_step), args.temp))))

                    # pickle.dump(saved_weights, open(ensure_dir(path.join(args.data_dir, model.name, "{}saved_eval_iter-{}_temp-{}.bin".format(args.dataset_prefix, int(i_iter/args.eval_step), args.temp))), "wb"))

                    if best_model > iter_eval:
                        print("save best model")
                        best_model = iter_eval
                        torch.save(
                            model,
                            path.join("data", args.data_dir,
                                      "{}.pt".format(model.name)))

            # test performance
            model = torch.load(
Example #32
    print('Completed extracting case data from ' + f_name)

    # Create n_grams
    n_gram_dir = generate_n_grams.process_dir(case_data_dir, 2, 4, b_verbose=True, b_size=2)

    print('Completed generating n_grams from ' + f_name)

    helper.move_dir(n_gram_dir, os.path.join(save_dir, f_name))

    print('Completed processing ' + str(f_name) + ' in ' + str(time.time() - start) + ' (s)')

    helper.delete_dir(f_name)


def main():

    files = [x for x in helper.get_files('.') if x.endswith('_complete.zip')]

    num_cores = multiprocessing.cpu_count()

    Parallel(n_jobs=num_cores)(delayed(process_file)(f) for f in files)


if __name__ == '__main__':

    save_dir = 'data'
    helper.ensure_dir(save_dir)

    main()