def __init__(self, config, train_set=None):
    # Book-keeping.
    self.config = config
    if self.config['pretrained']:
        self.init_saved_network(self.config['pretrained'])
    else:
        assert train_set is not None
        print('Train vocab: {}'.format(len(train_set.vocab)))
        vocab = Counter()
        for w in train_set.vocab:
            if train_set.vocab[w] >= config['min_freq']:
                vocab[w] = train_set.vocab[w]
        print('Pruned train vocab: {}'.format(len(vocab)))

        # Building network.
        word_model = WordModel(embed_size=self.config['embed_size'],
                               filename=self.config['embed_file'],
                               embed_type=self.config['embed_type'],
                               top_n=self.config['top_vocab'],
                               additional_vocab=vocab)
        self.config['embed_size'] = word_model.embed_size
        self._init_new_network(train_set, word_model)

    num_params = 0
    for name, p in self.network.named_parameters():
        print('{}: {}'.format(name, str(p.size())))
        num_params += p.numel()
    print('#Parameters = {}\n'.format(num_params))

    self._init_optimizer()
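
# --- Usage sketch (not from the original source). The `Model` class name, the
# config values, and the `train_set` object below are assumptions for
# illustration; only the config keys are the ones this constructor actually
# reads, and `train_set.vocab` is assumed to be a word -> count mapping as the
# pruning loop above implies.
config = {
    'pretrained': None,                 # path to a saved model, or None to train from scratch
    'min_freq': 5,                      # prune training words rarer than this
    'embed_size': 300,
    'embed_file': 'data/glove.840B.300d.txt',  # hypothetical path
    'embed_type': 'glove',
    'top_vocab': 100000,
}
model = Model(config, train_set=train_set)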
def doc_parsing(doc):
    listField = []
    listTable = []
    fieldName = ''
    fileDesc = ''
    for doc_part in doc.element.body:
        if isinstance(doc_part, CT_P):
            pg = Paragraph(doc_part, doc).text
            if pg.find('<table_name>') >= 0 and pg.find('</table_name>') > 0:
                fieldName = pg[pg.find('<table_name>') + len('<table_name>'):pg.find('</table_name>')] + '.java'
                fileDesc = pg[0:pg.find('<table_name>')]
        if isinstance(doc_part, CT_Tbl) and fieldName != '':
            tableinfo = TableInfo()
            tableinfo.fileName = fieldName
            tableinfo.fileDesc = fileDesc
            tb1 = Table(doc_part, doc)
            isMytable = doc_mytable(tb1)
            if not isMytable:
                continue
            for row in range(len(tb1.rows)):
                if row == 0:  # Skip the header row.
                    continue
                w2 = WordModel()
                # `dict` (defined elsewhere in this module) maps field keys to table columns.
                w2.field = getCellText(tb1, row, dict.get("field", ''))
                w2.fieldName = getCellText(tb1, row, dict.get("fieldName", ''))
                w2.fieldType = getCellText(tb1, row, dict.get("fieldType", ''))
                w2.comment = getCellText(tb1, row, dict.get("comment", ''))
                w2.must = getCellText(tb1, row, dict.get("must", ''))
                # print(w2.display())
                w2.fieldType = dataConvert(w2.fieldType)
                listField.append(w2)
                # for col in range(len(tb1.columns)):
                #     cell_table = tb1.cell(row, col)
                #     table_nested_parsing(cell_table, row, col)
            tableinfo.listField = listField
            listTable.append(tableinfo)
            fieldName = ''
            listField = []
    return listTable
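
# --- Usage sketch (not from the original source). These are the python-docx
# imports doc_parsing appears to rely on; TableInfo, WordModel, getCellText,
# doc_mytable, dataConvert and the column mapping `dict` are assumed to be
# defined elsewhere in this module, and the input file name is hypothetical.
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import Table
from docx.text.paragraph import Paragraph

doc = Document('table_spec.docx')
for tableinfo in doc_parsing(doc):
    print(tableinfo.fileName, len(tableinfo.listField))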
import json

import coloredlogs, logging
from encoder import Encoder
from word_model import WordModel
from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F

# Create a logger object.
logger = logging.getLogger(__name__)
coloredlogs.install(level='DEBUG')
coloredlogs.install(fmt='%(asctime)s,%(msecs)03d %(levelname)s %(message)s')

dataset = json.load(open('data/dev-v1.1.json'))

word_model = WordModel()
logger.warning('Loading Vocab ...')
word_model.load_vocab()
vocab_size = word_model.vocab.length()

encoder = Encoder(vocab_size=vocab_size)
optimiser = torch.optim.SGD(encoder.parameters(), lr=0.0001)
criterion = nn.NLLLoss()


def train_model(context, question, answer, target_start, target_end):
    context, question, answer = Variable(context), Variable(question), Variable(answer)
    context = context.unsqueeze(0)
    question = question.unsqueeze(0)
    answer = answer.unsqueeze(0)
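
# --- Hypothetical sketch (not from the original source): train_model is cut off
# above, so this shows how a single training step is commonly completed with the
# optimiser and NLLLoss defined above. The Encoder forward signature and the
# assumption that it returns log-softmax start/end score distributions are not
# taken from encoder.py.
def train_step(context, question, target_start, target_end):
    optimiser.zero_grad()
    # Assumed outputs: per-token log-probabilities for the answer start and end.
    start_scores, end_scores = encoder(context, question)
    loss = criterion(start_scores, target_start) + criterion(end_scores, target_end)
    loss.backward()
    optimiser.step()
    return loss.item()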
import json

import nltk
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')

from word_model import Vocab, WordModel
import pickle
import coloredlogs, logging
from encoder import Encoder
from torch.autograd import Variable
import torch
import torch.nn as nn

# Create a logger object.
logger = logging.getLogger(__name__)
coloredlogs.install(level='DEBUG')
coloredlogs.install(fmt='%(asctime)s,%(msecs)03d %(levelname)s %(message)s')

dataset = json.load(open('data/dev-v1.1.json'))

word_model = WordModel()
logger.warning('Generating Vocab ...')
word_model.store_into_vocab(dataset)
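
# --- Hypothetical follow-up (not from the original source): pickle is imported
# above but unused in this fragment, so a plausible next step is persisting the
# generated vocabulary for later load_vocab() calls. The file name and the
# assumption that word_model.vocab is picklable are mine.
with open('data/vocab.pkl', 'wb') as f:
    pickle.dump(word_model.vocab, f)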
def termToBitmap(term):
    # NOTE: `url` (the cortical.io term-to-fingerprint endpoint) is assigned in
    # lines not shown in this fragment.
    headers = {"Content-Type": "application/json", "api_key": CORTICAL_API_KEY}
    response = requests.post(url, headers=headers, data=term)
    return json.loads(response.content).pop()


def fingerprintToTerm(fingerprint):
    url = "http://api.cortical.io:80/rest/expressions/similarTerms?retinaName=en_associative"
    headers = {"Content-Type": "application/json", "api_key": CORTICAL_API_KEY}
    response = requests.post(url, headers=headers, data=json.dumps(fingerprint))
    return json.loads(response.content)


auth = tweepy.OAuthHandler(TWITTER_KEY, TWITTER_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET)

model = WordModel()
api = tweepy.API(auth)
public_tweets = api.user_timeline("rhyolight", count=100)
# print(type(public_tweets))
for tweet in public_tweets:
    # print(tweet.text)
    cleanTweet = cleanText(tweet.text)
    print(cleanTweet)
    sdr = termToBitmap(cleanTweet)
    # print(sdr)
    terms = fingerprintToTerm(sdr)
    print('\tclosest term: %s' % terms[0]['term'])
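
# --- Hypothetical illustration (not from the original source): cleanText is not
# shown in this fragment. This stand-in shows what tweet cleaning commonly does
# (drop URLs, @mentions and '#' signs, collapse whitespace); it is not the
# project's actual implementation.
import re

def clean_text_example(text):
    text = re.sub(r'https?://\S+', '', text)   # strip URLs
    text = re.sub(r'@\w+', '', text)           # strip @mentions
    text = text.replace('#', '')               # keep hashtag words, drop the '#'
    return ' '.join(text.split())              # collapse whitespace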
# To convert words in the input to indices of the embeddings matrix
word_to_idx = {word: i for i, word in enumerate(gensim_embeds.vocab.keys())}

# Set hyperparameters
n_classes = len(TAG_INDICES)  # Number of output classes (9)
n_epochs = EPOCHS
p = DROPOUT
report_every = 1

# Set up and initialize model
model = WordModel(pretrained_embeds, 100, len(word_to_idx), n_classes, p)
loss_function = NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.6)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training loop
for e in range(n_epochs + 1):
    total_loss = 0
    for sent in data["train"]:
        # (1) Set gradients to zero before processing a new example
        model.zero_grad()

        # (2) Encode sentence and tag sequence as sequences of indices
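
# --- Hypothetical continuation (not from the original source): the inner loop is
# cut off after step (2). This sketch assumes each `sent` is a list of
# (word, tag) pairs, every word appears in word_to_idx, and the model returns
# per-word log-probabilities over tags; those assumptions are mine.
words = torch.tensor([word_to_idx[w] for w, t in sent], device=device)
tags = torch.tensor([TAG_INDICES[t] for w, t in sent], device=device)

# (3) Forward pass: log-probabilities over tags for each word
log_probs = model(words)

# (4) Compute loss, backpropagate, and update parameters
loss = loss_function(log_probs, tags)
loss.backward()
optimizer.step()
total_loss += loss.item()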