class Crawler:
    def __init__(self, list_file):
        self.logger = Logger.get_logger(utils.get_fullname(self))
        self.list_file = list_file
        self.indexer = Indexer()
        self._client = None

    # return raw list in list format
    def parse_list(self, list_file):
        try:
            self.logger.info('Opening RSS file: %s' % list_file)
            f = open(list_file, 'r')
        except IOError:
            self.logger.error('Cannot read file: %s' % list_file)
            return -1
        self.feeds_list = []
        line = f.readline()
        while line:
            self.logger.debug('Reading: %s' % line)
            feeds = feedparser.parse(line)
            try:
                # default only get the latest entry
                raw_f = feeds['entries'][0]
                feed_item = EzrssFeed(raw_f.link)
                feed_item.parse_name(raw_f.summary or raw_f.value)
                feed_item.parse_season(raw_f.summary or raw_f.value)
                self.feeds_list.append(feed_item)
            except IndexError:
                pass
            line = f.readline()
        f.close()

    @property
    def client(self):
        return self._client

    @client.setter
    def client(self, c):
        self._client = c

    def run(self):
        self.parse_list(self.list_file)
        self.logger.info('Start checking latest RSS feeds')
        for feed in self.feeds_list:
            if feed.name:
                if not self.indexer.episode_exists(feed.name, feed.url):
                    save_path = os.path.join(getattr(settings, 'SAVE_DIR'),
                                             feed.name, feed.season)
                    self.client.start_from_url(feed.url, save_path)
                    self.indexer.save(feed.name, feed.url)
            else:
                continue
        self.logger.info('Exiting application')
def from_word_vectors(cls, word_vectors, unique_labels):
    """Instantiate the vectorizer."""
    review_vocab = word_vectors
    rating_vocab = Indexer()
    # Add ratings
    for l in unique_labels:
        rating_vocab.add_and_get_index(l)
    return cls(review_vocab, rating_vocab)
def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object.
    Augments this with an UNK embedding that is the 0 vector. Reads in all
    embeddings with no filtering -- you should only use this for relativized
    word embedding files.

    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    f = open(embeddings_file)
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful if you need to pad
    # sequences out to a fixed length
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    for line in f:
        if line.strip() != "":
            space_idx = line.find(' ')
            word = line[:space_idx]
            numbers = line[space_idx + 1:]
            float_numbers = [
                float(number_str) for number_str in numbers.split()
            ]
            vector = np.array(float_numbers)
            word_indexer.add_and_get_index(word)
            # Append the PAD and UNK vectors to start. Have to do this weirdly
            # because we need to read the first line of the file to see what
            # the embedding dim is
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    f.close()
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))
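# Hedged usage sketch for read_word_embeddings. The path below is illustrative,
# and the attribute names (word_indexer, vectors) are assumed from the
# constructor call above rather than a documented WordEmbeddings API.
embeddings = read_word_embeddings("data/glove.6B.300d-relativized.txt")
unk_idx = embeddings.word_indexer.index_of("UNK")      # position 1 by construction
word_idx = embeddings.word_indexer.index_of("movie")   # -1 if absent from the file
vector = embeddings.vectors[word_idx if word_idx >= 0 else unk_idx]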
class CharTokenizer:
    """ Class to create char tokens """

    def __init__(self, max_word_length):
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        self.char_vocab_index.add_and_get_index(PAD_TOKEN)  # PAD is 0
        self.char_vocab_index.add_and_get_index(UNK_TOKEN)  # Unknown token is 1
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)
        self.max_word_length = max_word_length

    def convert_words_to_charids(self, words):
        word_charids = []
        for w in words:
            charids = []
            for c in w:
                charids.append(self.char_vocab_index.index_of(c))
            charids = charids[:self.max_word_length]
            if len(charids) < self.max_word_length:
                charids.extend([0] * (self.max_word_length - len(charids)))
            word_charids.append(charids)
        return word_charids
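# Minimal usage sketch for CharTokenizer, assuming PAD_TOKEN / UNK_TOKEN are
# defined and Indexer assigns ids in insertion order as above. Each word is
# truncated or right-padded with the PAD id (0) to max_word_length.
tokenizer = CharTokenizer(max_word_length=16)
char_ids = tokenizer.convert_words_to_charids(["hello", "world"])
assert all(len(ids) == 16 for ids in char_ids)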
def load(self, specialTokenList=None):
    indexer = Indexer(specialTokenList)
    print("... loading training data.")
    trainPairs, trainLens = self._load_pairs(indexer,
                                             self.dataDict['train_source'],
                                             self.dataDict['train_target'])
    print("... loading test data.")
    testPairs, testLens = self._load_pairs(indexer,
                                           self.dataDict['test_source'],
                                           self.dataDict['test_source'])
    print("Done!\n")
    return indexer, trainPairs, trainLens, testPairs, testLens
def generate_indexer(usr_dataset, usr_bm_tg, feature_begin, feature_end):
    logging.info('generating indexer ...')
    indexer = Indexer(['user', 'tag', 'bookmark'])
    min_time = 1e30
    max_time = -1

    for line in usr_dataset[1:]:
        line_items = line.split('\t')
        contact_timestamp = float(line_items[2]) / 1000
        min_time = min(min_time, contact_timestamp)
        max_time = max(max_time, contact_timestamp)
        if feature_begin < contact_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('user', line_items[1])

    for line in usr_bm_tg[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('bookmark', line_items[1])
            indexer.index('tag', line_items[2])

    with open('delicious/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Bookmarks: %d\n' % indexer.indices['bookmark'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Contact: %d\n' % len(usr_dataset))
        output.write('#Save : %d\n' % len(usr_bm_tg))
        output.write('#Attach: %d\n' % len(usr_bm_tg))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer
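# The multi-type Indexer used by generate_indexer / generate_papers is not
# shown in these snippets. This is a minimal sketch consistent with the calls
# above (index, get_index, and the indices / mapping attributes) -- an
# assumption for illustration, not the original implementation.
class Indexer:
    def __init__(self, node_types):
        self.mapping = {t: {} for t in node_types}   # type -> {raw id -> int id}
        self.indices = {t: 0 for t in node_types}    # type -> count of ids assigned

    def index(self, node_type, raw_id):
        # Assign a new integer id on first sight, otherwise reuse the old one.
        if raw_id not in self.mapping[node_type]:
            self.mapping[node_type][raw_id] = self.indices[node_type]
            self.indices[node_type] += 1
        return self.mapping[node_type][raw_id]

    def get_index(self, node_type, raw_id):
        # Lookup without inserting; returns None for unseen ids.
        return self.mapping[node_type].get(raw_id)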
class FeatureExtractor():
    def __init__(self):
        self.indexer = Indexer()

    def get_indexer(self):
        return self.indexer

    def extract_features(self, ex):
        feature_vector = np.zeros(len(self.indexer))
        for word in ex.text:
            index = self.indexer.index_of(word)
            if index >= 0:
                # Skip words that were never added to the indexer
                # (index_of returns -1 for them).
                feature_vector[index] += 1
        return feature_vector
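# Minimal usage sketch for FeatureExtractor, assuming an Indexer supporting
# add_and_get_index / index_of / len() and an example object with a .text
# token list (the _Example helper here is illustrative, not from the original).
extractor = FeatureExtractor()
for sentence in [["good", "movie"], ["bad", "movie"]]:
    for word in sentence:
        extractor.get_indexer().add_and_get_index(word)  # build the vocabulary first

class _Example:
    def __init__(self, text):
        self.text = text

features = extractor.extract_features(_Example(["good", "movie", "movie"]))
# -> bag-of-words counts over the indexed vocabulary, e.g. [1., 2., 0.]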
'''
contains a header line and 45463 data lines, each line includes a mId and its
overview (some sentences).
'''
movies.to_csv("processed_data/overviews.csv",
              columns=['mId', 'overview'],
              index=False)
movies.to_csv("processed_data/mId2Title.csv",
              columns=['mId', 'tmdbId', 'title'],
              index=False)

'''
create genres
mId2Genre: 45463 lines, each line includes (mId, num of genres, gIds)
Genre2Id: 20 lines, each line includes (gId, genre name)
gId ranges from 45843 to 45862
'''
f = open("processed_data/mId2Genre.txt", "w")
genreIdx = Indexer()
for idx, row in movies.iterrows():
    mId, raw_genres = row['mId'], row['genres']
    raw_genres = raw_genres.replace("\'", "\"")
    genres_l = json.loads(raw_genres)
    f.write("%d %d" % (mId, len(genres_l)))
    for g in genres_l:
        f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
    f.write("\n")
f.close()

f = open("processed_data/Genre2Id.txt", "w")
num_genres = len(genreIdx)
for i in range(num_genres):
    f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
f.close()
        # (inside create_dataset, iterating over tweets)
        try:
            labels, emoji_labels = get_labels(tweet, indexer)
            label = get_most_recent_label(tweet, emoji_labels, indexer)
        except:
            continue
        cleaned_text = clean_tweet(tweet)
        datapoint = DataPoint(cleaned_text, label)
        dataset.append(datapoint)
        label_counter[indexer.get_object(label)] += 1
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")
    return dataset


indexer = Indexer()
label_counter = Counter()
dataset = create_dataset(tweets, indexer, label_counter)
print("length of dataset: ", len(dataset))

from tokenizer import tokenizer as vinay
v = vinay.TweetTokenizer(regularize=True, preserve_len=False)
word_cnts = Counter()


def count_words(text):
    words = v.tokenize(text)
    for word in words:
        word_cnts[word] += 1
class CharBaselineReader(nn.Module):
    """
    Baseline QA Model
    [Architecture]
        0) Inputs: passages and questions
        1) Embedding Layer: converts words to vectors
        2) Context2Query: computes weighted sum of question embeddings for
               each position in passage.
        3) Passage Encoder: LSTM or GRU.
        4) Question Encoder: LSTM or GRU.
        5) Question Attentive Sum: computes weighted sum of question hidden
               states.
        6) Start Position Pointer: computes scores (logits) over passage
               conditioned on the question vector.
        7) End Position Pointer: computes scores (logits) over passage
               conditioned on the question vector.

    Args:
        args: `argparse` object.

    Inputs:
        batch: a dictionary containing batched tensors.
            {
                'passages': LongTensor [batch_size, p_len],
                'questions': LongTensor [batch_size, q_len],
                'start_positions': Not used in `forward`,
                'end_positions': Not used in `forward`,
            }

    Returns:
        Logits for start positions and logits for end positions.
        Tuple: ([batch_size, p_len], [batch_size, p_len])
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.pad_token_id = args.pad_token_id

        # Initialize embedding layer (1)
        self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)

        # Initialize char embedding layer
        self.char_embedding = nn.Embedding(args.char_vocab_size,
                                           args.char_embedding_dim)

        # Initialize Context2Query (2)
        self.aligned_att = AlignedAttention(args.embedding_dim,
                                            args.char_embedding_dim)

        rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

        # Initialize passage encoder (3)
        self.passage_rnn = rnn_cell(
            args.embedding_dim * 2,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        # Initialize question encoder (4)
        self.question_rnn = rnn_cell(
            args.embedding_dim,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(self.args.dropout)

        # Adjust hidden dimension if bidirectional RNNs are used
        _hidden_dim = (args.hidden_dim * 2
                       if args.bidirectional else args.hidden_dim)

        # Initialize attention layer for question attentive sum (5)
        self.question_att = SpanAttention(_hidden_dim)

        # Initialize bilinear layer for start positions (6)
        self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize bilinear layer for end positions (7)
        self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize char indexer
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

    def load_pretrained_embeddings(self, vocabulary, path):
        """
        Loads GloVe vectors and initializes the embedding matrix.

        Args:
            vocabulary: `Vocabulary` object.
            path: Embedding path, e.g. "glove/glove.6B.300d.txt".
        """
        if self.args.embedding == 'glove':
            embedding_map = load_cached_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                if word in embedding_map:
                    embeddings[i] = torch.tensor(embedding_map[word])
                    num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)
        else:
            #####################
            # Loads Fasttext embeddings
            embedding_map = load_fasttext_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                embeddings[i] = torch.tensor(
                    embedding_map.get_word_vector(word))
                num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)

        return num_pretrained

    def sorted_rnn(self, sequences, sequence_lengths, rnn):
        """
        Sorts and packs inputs, then feeds them into RNN.

        Args:
            sequences: Input sequences, [batch_size, len, dim].
            sequence_lengths: Lengths for each sequence, [batch_size].
            rnn: Registered LSTM or GRU.

        Returns:
            All hidden states, [batch_size, len, hid].
        """
        # Sort input sequences
        sorted_inputs, sorted_sequence_lengths, restoration_indices = _sort_batch_by_length(
            sequences, sequence_lengths)
        # Pack input sequences
        packed_sequence_input = pack_padded_sequence(
            sorted_inputs,
            sorted_sequence_lengths.data.long().tolist(),
            batch_first=True)
        # Run RNN
        packed_sequence_output, _ = rnn(packed_sequence_input, None)
        # Unpack hidden states
        unpacked_sequence_tensor, _ = pad_packed_sequence(
            packed_sequence_output, batch_first=True)
        # Restore the original order in the batch and return all hidden states
        return unpacked_sequence_tensor.index_select(0, restoration_indices)

    def forward(self, batch):
        # Obtain masks and lengths for passage and question.
        passage_mask = (batch['passages'] !=
                        self.pad_token_id)  # [batch_size, p_len]
        question_mask = (batch['questions'] !=
                         self.pad_token_id)  # [batch_size, q_len]
        passage_lengths = passage_mask.long().sum(-1)  # [batch_size]
        question_lengths = question_mask.long().sum(-1)  # [batch_size]

        # 1) Embedding Layer: Embed the passage and question.
        passage_embeddings = self.embedding(
            batch['passages'])  # [batch_size, p_len, p_dim]
        question_embeddings = self.embedding(
            batch['questions'])  # [batch_size, q_len, q_dim]
        passage_char_embeddings = self.char_embedding(
            batch['char_passages']
        )  # [batch_size, p_len, word_length, word_dim] [64, 168, 16, 64]
        question_char_embeddings = self.char_embedding(
            batch['char_questions']
        )  # [batch_size, q_len, word_length, word_dim]

        if self.args.char_embedding_type == 'average':
            # Average char embeddings baseline
            passage_char_embeddings_avg = passage_char_embeddings.mean(
                dim=2).squeeze(0)
            question_char_embeddings_avg = question_char_embeddings.mean(
                dim=2).squeeze(0)
            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_avg], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_avg], dim=2)
            # print('passage_char_embeddings ', passage_char_embeddings.shape)
            # print('question_char_embeddings ', question_char_embeddings.shape)
        else:
            # Conv 1D char embeddings
            passage_char_embeddings_conv1d_input = passage_char_embeddings.reshape(
                (-1, passage_char_embeddings.shape[3],
                 passage_char_embeddings.shape[2]))
            question_char_embeddings_conv1d_input = question_char_embeddings.reshape(
                (-1, question_char_embeddings.shape[3],
                 question_char_embeddings.shape[2]))
            conv1d = torch.nn.Conv1d(self.args.char_embedding_dim,
                                     self.args.char_embedding_dim, 3)
            relu = torch.nn.ReLU()
            if torch.cuda.is_available():
                conv1d.cuda()
                relu.cuda()
            passage_char_embeddings_tmp1 = relu(
                conv1d(passage_char_embeddings_conv1d_input))
            # Last dimension of conv1d output we want to collapse using global
            # max pooling
            passage_char_embeddings_final = torch.nn.functional.max_pool1d(
                passage_char_embeddings_tmp1,
                passage_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    passage_char_embeddings.shape[0],
                    passage_char_embeddings.shape[1], -1)
            question_char_embeddings_tmp1 = relu(
                conv1d(question_char_embeddings_conv1d_input))
            # Last dimension of conv1d output we want to collapse using global
            # max pooling
            question_char_embeddings_final = torch.nn.functional.max_pool1d(
                question_char_embeddings_tmp1,
                question_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    question_char_embeddings.shape[0],
                    question_char_embeddings.shape[1], -1)
            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_final], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_final], dim=2)

        # 2) Context2Query: Compute weighted sum of question embeddings for
        # each passage word and concatenate with passage embeddings.
        aligned_scores = self.aligned_att(
            passage_final_embeddings, question_final_embeddings,
            ~question_mask)  # [batch_size, p_len, q_len]
        aligned_embeddings = aligned_scores.bmm(
            question_embeddings)  # [batch_size, p_len, q_dim]
        passage_embeddings = cuda(
            self.args,
            torch.cat((passage_embeddings, aligned_embeddings), 2),
        )  # [batch_size, p_len, p_dim + q_dim]

        # 3) Passage Encoder
        passage_hidden = self.sorted_rnn(
            passage_embeddings, passage_lengths,
            self.passage_rnn)  # [batch_size, p_len, p_hid]
        passage_hidden = self.dropout(
            passage_hidden)  # [batch_size, p_len, p_hid]

        # 4) Question Encoder: Encode question embeddings.
        question_hidden = self.sorted_rnn(
            question_embeddings, question_lengths,
            self.question_rnn)  # [batch_size, q_len, q_hid]

        # 5) Question Attentive Sum: Compute weighted sum of question hidden
        # vectors.
        question_scores = self.question_att(question_hidden, ~question_mask)
        question_vector = question_scores.unsqueeze(1).bmm(
            question_hidden).squeeze(1)
        question_vector = self.dropout(question_vector)  # [batch_size, q_hid]

        # 6) Start Position Pointer: Compute logits for start positions
        start_logits = self.start_output(passage_hidden, question_vector,
                                         ~passage_mask)  # [batch_size, p_len]

        # 7) End Position Pointer: Compute logits for end positions
        end_logits = self.end_output(passage_hidden, question_vector,
                                     ~passage_mask)  # [batch_size, p_len]

        return start_logits, end_logits  # [batch_size, p_len], [batch_size, p_len]
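# The reader returns per-position start/end logits. A simple decoding sketch is
# to take the argmax of each and keep the end from preceding the start; this is
# illustrative only -- the original evaluation code is not shown in these
# snippets, and decode_span is a hypothetical helper.
import torch


def decode_span(start_logits, end_logits):
    # start_logits, end_logits: [batch_size, p_len]
    start_pred = start_logits.argmax(dim=-1)  # [batch_size]
    end_pred = end_logits.argmax(dim=-1)      # [batch_size]
    # Clamp so the predicted end never precedes the predicted start.
    end_pred = torch.maximum(end_pred, start_pred)
    return start_pred, end_pred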
import numpy as np
import torch
from torch.autograd import Variable
from solver import Solver
from preprocess.tacotron.utils import spectrogram2wav
#from preprocess.tacotron.audio import inv_spectrogram, save_wav
from scipy.io.wavfile import write
from preprocess.tacotron.mcep import mc2wav

if __name__ == '__main__':
    feature = 'sp'
    hps = Hps()
    hps.load('./hps/v19.json')
    hps_tuple = hps.get_tuple()
    solver = Solver(hps_tuple, None)
    solver.load_model('/storage/model/voice_conversion/v19/model.pkl-59999')
    if feature == 'mc':
        # indexer to extract data
        indexer = Indexer()
        src_mc = indexer.index(speaker_id='225',
                               utt_id='366',
                               dset='test',
                               feature='norm_mc')
        tar_mc = indexer.index(speaker_id='226',
                               utt_id='366',
                               dset='test',
                               feature='norm_mc')
        expand_src_mc = np.expand_dims(src_mc, axis=0)
        expand_tar_mc = np.expand_dims(tar_mc, axis=0)
        src_mc_tensor = torch.from_numpy(expand_src_mc).type(torch.FloatTensor)
        tar_mc_tensor = torch.from_numpy(expand_tar_mc).type(torch.FloatTensor)
        c1 = Variable(torch.from_numpy(np.array([0]))).cuda()
        c2 = Variable(torch.from_numpy(np.array([1]))).cuda()
        results = [src_mc]
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, \
    get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True
tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")

train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]

X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []
def generate_papers(datafile, feature_begin, feature_end, observation_begin,
                    observation_end, conf_list):
    logging.info('generating papers ...')

    # try:
    #     result = pickle.load(open('dblp/data/papers_%s.pkl' % path, 'rb'))
    #     return result
    # except IOError:
    #     pass

    indexer = Indexer(['author', 'paper', 'term', 'venue'])

    index, authors, title, year, venue = None, None, None, None, None
    references = []

    write = 0
    cite = 0
    include = 0
    published = 0

    min_year = 3000
    max_year = 0

    papers_feature_window = []
    papers_observation_window = []

    with open(datafile) as file:
        dataset = file.read().splitlines()

    for line in dataset:
        if not line:
            if year and venue:
                year = int(year)
                if year > 0 and authors and venue in conf_list:
                    min_year = min(min_year, year)
                    max_year = max(max_year, year)
                    authors = authors.split(',')
                    terms = parse_term(title)
                    write += len(authors)
                    cite += len(references)
                    include += len(terms)
                    published += 1
                    p = Paper(year)
                    if feature_begin < year <= feature_end:
                        p.id = indexer.index('paper', index)
                        p.terms = [
                            indexer.index('term', term) for term in terms
                        ]
                        p.references = [
                            indexer.index('paper', paper_id)
                            for paper_id in references
                        ]
                        p.authors = [
                            indexer.index('author', author_name)
                            for author_name in authors
                        ]
                        p.venue = indexer.index('venue', venue)
                        bisect.insort(papers_feature_window, p)
                    elif observation_begin < year <= observation_end:
                        p.references = references
                        p.authors = authors
                        papers_observation_window.append(p)
            index, authors, title, year, venue = None, None, None, None, None
            references = []
        else:
            begin = line[1]
            if begin == '*':
                title = line[2:]
            elif begin == '@':
                authors = line[2:]
            elif begin == 't':
                year = line[2:]
            elif begin == 'c':
                venue = line[2:]
            elif begin == 'i':
                index = line[6:]
            elif begin == '%':
                references.append(line[2:])

    for p in papers_observation_window:
        authors = []
        references = []
        for author in p.authors:
            author_id = indexer.get_index('author', author)
            if author_id is not None:
                authors.append(author_id)
        for ref in p.references:
            paper_id = indexer.get_index('paper', ref)
            if paper_id is not None:
                references.append(paper_id)
        p.authors = authors
        p.references = references

    with open('dblp/data/metadata_%s.txt' % path, 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Authors: %d\n' % indexer.indices['author'])
        output.write('#Papers: %d\n' % indexer.indices['paper'])
        output.write('#Venues: %d\n' % indexer.indices['venue'])
        output.write('#Terms: %d\n\n' % indexer.indices['term'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Write: %d\n' % write)
        output.write('#Cite: %d\n' % cite)
        output.write('#Publish: %d\n' % published)
        output.write('#Contain: %d\n' % include)
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % min_year)
        output.write('To: %s\n' % max_year)

    result = papers_feature_window, papers_observation_window, indexer.indices
    # pickle.dump(result, open('dblp/data/papers_%s.pkl' % path, 'wb'))
    return result
def generate_indexer(user_rates_movies_ds, user_tags_movies_ds, movie_actor_ds,
                     movie_director_ds, movie_genre_ds, movie_countries_ds,
                     feature_begin, feature_end):
    logging.info('generating indexer ...')
    min_time = 1e30
    max_time = -1
    indexer = Indexer(
        ['user', 'tag', 'movie', 'actor', 'director', 'genre', 'country'])

    for line in user_rates_movies_ds[1:]:
        line_items = line.split('\t')
        rating_timestamp = float(line_items[3]) / 1000
        min_time = min(min_time, rating_timestamp)
        max_time = max(max_time, rating_timestamp)
        rating = float(line_items[2])
        if feature_begin < rating_timestamp <= feature_end and rating > rating_threshold:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])

    for line in user_tags_movies_ds[1:]:
        line_items = line.split('\t')
        tag_timestamp = float(line_items[3]) / 1000
        if feature_begin < tag_timestamp <= feature_end:
            indexer.index('user', line_items[0])
            indexer.index('movie', line_items[1])
            indexer.index('tag', line_items[2])

    for line in movie_actor_ds[1:]:
        line_items = line.split('\t')
        ranking = int(line_items[3])
        if ranking < actor_threshold and line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('actor', line_items[1])

    for line in movie_director_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('director', line_items[1])

    for line in movie_genre_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('genre', line_items[1])

    for line in movie_countries_ds[1:]:
        line_items = line.split('\t')
        if line_items[0] in indexer.mapping['movie']:
            # indexer.index('movie', line_items[0])
            indexer.index('country', line_items[1])

    with open('movielens/data/metadata.txt', 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Users: %d\n' % indexer.indices['user'])
        output.write('#Tags: %d\n' % indexer.indices['tag'])
        output.write('#Movies: %d\n' % indexer.indices['movie'])
        output.write('#Actors: %d\n' % indexer.indices['actor'])
        output.write('#Director: %d\n' % indexer.indices['director'])
        output.write('#Genre: %d\n' % indexer.indices['genre'])
        output.write('#Countries: %d\n' % indexer.indices['country'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Rate: %d\n' % len(user_rates_movies_ds))
        output.write('#Attach: %d\n' % len(user_tags_movies_ds))
        output.write('#Played_by: %d\n' % len(movie_actor_ds))
        output.write('#Directed_by: %d\n' % len(movie_director_ds))
        output.write('#Has: %d\n' % len(movie_genre_ds))
        output.write('#Produced_in: %d\n' % len(movie_countries_ds))
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % datetime.fromtimestamp(min_time))
        output.write('To: %s\n' % datetime.fromtimestamp(max_time))

    return indexer
class CNN(nn.Module):
    def __init__(self, args, reduced_size=None, info={}):
        super(CNN, self).__init__()

        # disc_type=DISC_TYPE_MATRIX
        self.disc_type = disc_type = args.disc_type
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=2, padding=0), nn.ReLU())  # 1,4,3,3
        self.layer2 = nn.Sequential(
            nn.Conv2d(4, 8, kernel_size=2),
            nn.ReLU())  # 1,8,2,2  ## but for 5 lines, it is 1,8,3,3
        if args.data_type == "sonnet_endings":
            self.scorer = nn.Linear(2 * 2 * 8, 1)
        elif args.data_type == "limerick":
            self.scorer = nn.Linear(3 * 3 * 8, 1)
        self.predictor = nn.Sigmoid()
        self.args = args
        self.use_cuda = args.use_cuda

        ##
        self.g_indexer = Indexer(args)
        self.g_indexer.load('tmp/tmp_' + args.g2p_model_name +
                            '/solver_g_indexer')
        self.g2pmodel = Model(H=info['H'],
                              args=args,
                              i_size=self.g_indexer.w_cnt,
                              o_size=self.g_indexer.w_cnt,
                              start_idx=self.g_indexer.w2idx[utils.START])
        if not args.learn_g2p_encoder_from_scratch:
            print("=====" * 7, "LOADING g2p ENCODER PRETRAINED")
            model_dir = 'tmp/tmp_' + args.g2p_model_name + '/'
            state_dict_best = torch.load(model_dir + 'model_best')
            self.g2pmodel.load_state_dict(state_dict_best)
        if not args.trainable_g2p:
            assert not args.learn_g2p_encoder_from_scratch
            for param in self.g2pmodel.parameters():
                param.requires_grad = False

    def display_params(self):
        print("=" * 44)
        print("[CNN]: model parameters")
        for name, param in self.named_parameters():
            print("name=", name, " || grad:", param.requires_grad,
                  "| size = ", param.size())
        print("=" * 44)

    def _compute_word_reps(self, words_str, deb=False):
        if deb:
            print("words_str = ", words_str)
        use_eow_marker = self.args.use_eow_in_enc
        assert not use_eow_marker, "Not yet tested"
        word_reps = [self.g_indexer.w_to_idx(s1) for s1 in words_str]
        if self.args.use_eow_in_enc:
            x_end = self.g_indexer.w2idx[utils.END]
            word_reps = [x_i + [x_end] for x_i in word_reps]
        word_reps = [self.g2pmodel.encode(w) for w in word_reps]
        return word_reps

    def _compute_pairwise_dot(self, measure_encodings_b):
        ret = []
        sz = len(measure_encodings_b)
        for measure_encodings_b_t in measure_encodings_b:
            for measure_encodings_b_t2 in measure_encodings_b:
                t1 = torch.sum(measure_encodings_b_t * measure_encodings_b_t2)
                t2 = torch.sqrt(
                    torch.sum(measure_encodings_b_t * measure_encodings_b_t))
                t3 = torch.sqrt(
                    torch.sum(measure_encodings_b_t2 * measure_encodings_b_t2))
                assert t2 > 0
                assert t3 > 0, "t3=" + str(t3)
                ret.append(t1 / (t2 * t3))
        ret = torch.stack(ret)
        ret = ret.view(sz, sz)
        return ret

    def _score_matrix(self, x, deb=False):
        x = x[0].unsqueeze(0).unsqueeze(0)  # -> 1,1,ms,ms
        if deb:
            print("---x.shape = ", x.size())
        out = self.layer1(x)
        if deb:
            print("---out = ", out.size(), out)
        out = self.layer2(out)
        if deb:
            print("---out = ", out.size(), out)
        out = out.view(out.size(0), -1)  # arrange by bsz
        score = self.scorer(out)
        if deb:
            print("---out sum = ", torch.sum(out))
            print("---score = ", score)
        prob = self.predictor(score)
        return {'prob': prob, 'out': out, 'score': score}

    def _compute_rhyming_matrix(self, words_str, deb=False):
        word_reps = self._compute_word_reps(words_str)
        rhyming_matrix = self._compute_pairwise_dot(word_reps)
        return rhyming_matrix, words_str

    def _compute_rnn_on_word_reps(self, word_reps):
        h = (torch.zeros(1, self.linear_rep_H),
             torch.zeros(1, self.linear_rep_H))
        if self.use_cuda:
            h = h[0].cuda(), h[1].cuda()
        for w in word_reps:
            h = self.linear_rep_encoder(w, h)
        out, c = h
        return c

    def _run_discriminator(self, words_str, deb):
        rhyming_matrix, words_str = self._compute_rhyming_matrix(
            words_str, deb)
        vals = self._score_matrix([rhyming_matrix])
        vals.update({
            'rhyming_matrix': rhyming_matrix,
            'linear_rep': None,
            'words_str': words_str
        })
        return vals

    def update_discriminator(self, line_endings_gen, line_endings_train,
                             deb=False, word_idx_to_str_dict=None):
        eps = 0.0000000001
        ret = {}
        dump_info = {}
        words_str_train = [
            word_idx_to_str_dict[word_idx.data.cpu().item()]
            for word_idx in line_endings_train
        ]
        words_str_gen = [
            word_idx_to_str_dict[word_idx.data.cpu().item()]
            for word_idx in line_endings_gen
        ]

        disc_real = self._run_discriminator(words_str_train, deb)
        if deb:
            print("rhyming_matrix_trai = ", disc_real['rhyming_matrix'],
                  "|| prob = ", disc_real['prob'])
        if self.args.disc_type == DISC_TYPE_MATRIX:
            dump_info['rhyming_matrix_trai'] = disc_real[
                'rhyming_matrix'].data.cpu().numpy()
        dump_info['real_prob'] = disc_real['prob'].data.cpu().item()
        dump_info['real_words_str'] = disc_real['words_str']

        disc_gen = self._run_discriminator(words_str_gen, deb)
        if deb:
            print("rhyming_matrix_gen = ", disc_gen['rhyming_matrix'],
                  "|| prob = ", disc_gen['prob'])
        if self.args.disc_type == DISC_TYPE_MATRIX:
            dump_info['rhyming_matrix_gen'] = disc_gen[
                'rhyming_matrix'].data.cpu().numpy()
        dump_info['gen_prob'] = disc_gen['prob'].data.cpu().item()
        dump_info['gen_words_str'] = disc_gen['words_str']

        prob_real = disc_real['prob']
        prob_gen = disc_gen['prob']
        # Standard discriminator loss; the reward for the generator is the
        # discriminator's probability (or raw score) on the generated endings.
        loss = -torch.log(prob_real + eps) - torch.log(1.0 - prob_gen + eps)
        reward = prob_gen
        if self.args.use_score_as_reward:
            reward = disc_gen['score']
        ret.update({'loss': loss, 'reward': reward, 'dump_info': dump_info})
        return ret
reload(sys)
sys.setdefaultencoding('utf-8')

app = Flask(__name__)
app.secret_key = b'_5#y2L"F4Qas5nb113@&B#(V!*#8z\n\xec]/'
db = Database()
recommender = Recommender()
if not db.checkConnectivity():
    print('Unable to connect to database')
    sys.exit(-1)
indexer = Indexer()


@app.before_request
def authenticateUser():
    if request.endpoint != 'search' and request.endpoint != 'signIn' \
            and 'userid' not in session:
        return redirect(url_for('signIn'))


@app.route('/signout')
def signOut():
    session.clear()
    return redirect(url_for('signIn'))


@app.route('/signin', methods=['GET', 'POST'])