def output_mask(passage, answers):
    # Returns a vector -- same length as the passage -- with 1 if the token
    # is part of an answer, otherwise 0.
    # Hack: mark answer tokens with a sentinel prefix that is assumed never
    # to occur in real passage text.
    answer_marker = "$$answer$"
    for answer in answers:
        replacement = " ".join([answer_marker + w for w in tokenize(answer)])
        passage = passage.replace(answer, replacement)
    return np.array([(1 if token.startswith(answer_marker) else 0)
                     for token in tokenize(passage)])
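# A hedged usage sketch for output_mask: it assumes tokenize() splits on
# whitespace (the project's real tokenizer may differ). _demo_output_mask
# is an illustrative name, not part of the original code.
def _demo_output_mask():
    passage = "the cat sat on the mat"
    mask = output_mask(passage, ["the mat"])
    # With a whitespace tokenizer this yields array([0, 0, 0, 0, 1, 1]):
    # only the two tokens of "the mat" are flagged as answer tokens.
    return mask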
def fetch_abstracts():
    # First we need the ID and year of each document.
    results = db.query("SELECT ID, year FROM Document ORDER BY year;")
    rows = []
    for result in results:
        did = result[0]  # Document ID.
        y = result[1]    # Document year.
        path = base_path + "nips" + year_to_str(y) + "/" + get_str_id(did) + ".txt"
        with open(path, "r") as fo:
            abstract = get_abstract(fo.read())
        if abstract:
            text = remove_punctuation(abstract.lower())
            tokens = tokenize(text)
            cls_abs = remove_stopwords(tokens)
            rows.append((did, y, " ".join(cls_abs)))
    db.insert_into_mysql('Abstract', abs_columns, rows)
def make_ipsum():
    try:
        url = request.args.get('url')
        para_size = request.args.get('para-size')
        number_para = request.args.get('number-para')
        result = {"url": url}
        url = url.strip()
        copy = get_site_markup(url)
        clean = clean_copy(copy)
        tokenized = tokenize(clean)
        matches = match_tokens(tokenized, tags_danish)
        ipsum = ' '.join(matches)
        para_ipsum = make_para(ipsum, para_size, number_para)
        result["ipsum"] = para_ipsum
        return jsonify(data=result), 200
    except Exception:
        return jsonify(
            error={"message": "Oof, some of these fancy sites block our bot"}), 403
def add_doc(self, doc, flag):
    """
    Adds doc and doc components to class data.
    -1 -> unknown
     0 -> negative
     1 -> positive
    """
    self.docs.append((doc, flag))
    # add the doc's words
    tokens = tokenize(doc)
    # update the vocab
    self.vocab.update(tokens)
    if flag != -1:
        self.label_count[flag] += len(tokens)
    # get the number of times that each word appears in the doc
    word_counts = {}
    for word in tokens:
        if word not in word_counts:
            word_counts[word] = 0
        word_counts[word] += 1
    # record the per-document count for each word
    for word in tokens:
        if word not in self.words:
            self.words[word] = {}
        self.words[word][len(self.docs) - 1] = word_counts[word]
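# The manual word_counts loop in add_doc is equivalent to collections.Counter;
# a minimal sketch of the same bookkeeping (_count_tokens is an illustrative
# name, not part of the original class):
from collections import Counter

def _count_tokens(tokens):
    # Counter(tokens)[w] == number of times w appears in the doc, matching
    # the word_counts dict built by hand above.
    return Counter(tokens)

# _count_tokens(["a", "b", "a"]) -> Counter({'a': 2, 'b': 1})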
def topic_extraction(collection, max_topics=100):
    """
    :collection -> MongoDB collection obtained with find() or list of documents
    :max_topics -> max number of topics to analyse K-means performance for
    """
    corpus = []
    for tweet in collection:
        # tokenize() is assumed to return a string here, since
        # TfidfVectorizer.fit expects raw documents, not token lists.
        corpus.append(tokenize(parse_tweet(tweet)))
    tfidf = TfidfVectorizer(  # parameters can be changed
        min_df=5,
        max_df=0.95,
        max_features=8000,
    )
    tfidf.fit(corpus)
    text = tfidf.transform(corpus)
    labels = tfidf.get_feature_names()
    K = find_optimal_size(text, max_topics)
    clusters = MiniBatchKMeans(n_clusters=K,
                               init_size=1024,
                               batch_size=2048,
                               random_state=2211).fit_predict(text)
    df = pd.DataFrame(text.todense()).groupby(clusters).mean()
    top = []
    for i, r in df.iterrows():
        top_words = ', '.join([labels[t] for t in np.argsort(r)[-10:]])
        top.append("Cluster {}: {}".format(i, top_words))
    return top
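# find_optimal_size is called above but not defined in this snippet. Below is
# one plausible sketch using silhouette scores; this is an assumption about
# the helper, not the original implementation.
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

def find_optimal_size_sketch(text, max_topics):
    best_k, best_score = 2, -1.0
    for k in range(2, max_topics + 1):
        preds = MiniBatchKMeans(n_clusters=k,
                                random_state=2211).fit_predict(text)
        score = silhouette_score(text, preds)  # higher is better, in [-1, 1]
        if score > best_score:
            best_k, best_score = k, score
    return best_k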
def _predict_doc(self, x, flag):
    """ Get probability of x being positive/negative """
    if flag == 1:
        denom = self.X.num_positive()
    else:
        denom = self.X.num_negative()
    denom += self.X.vocab_size()
    # Sum log-probabilities for all words in x; log space avoids the
    # underflow a straight product of word probabilities would cause.
    words = tokenize(x)
    prob = math.log(self.X.priors[str(flag)])
    for word in words:
        wi = self._doc_count_for_word(word, flag=flag)
        # Laplace smoothing
        prob += math.log((float(wi) + 1.0) / (float(denom) + 2.0))
    return prob
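# Why _predict_doc accumulates log-probabilities: a raw product of thousands
# of small word probabilities underflows to 0.0 in floating point. A quick
# illustrative check (not part of the classifier):
import math

def _underflow_demo(n_words=2000, p=1e-4):
    product = p ** n_words           # underflows to 0.0
    log_sum = n_words * math.log(p)  # stays finite (about -18420.7)
    return product, log_sum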
def map_line_to_seq(line, inverse_voc):
    """
    Converts a string (sentence) to a sequence of integers.
    Will also tokenize the sentence.
    """
    return [
        inverse_voc[w] if w in inverse_voc else inverse_voc['<below_th>']
        for w in tokenize(line)
    ]
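# A hedged usage sketch for map_line_to_seq; the toy vocabulary below and
# the assumption that tokenize() splits on whitespace are illustrative only.
def _demo_map_line_to_seq():
    inverse_voc = {'<below_th>': 0, 'hello': 1, 'world': 2}
    seq = map_line_to_seq("hello strange world", inverse_voc)
    # "strange" is out of vocabulary, so it falls back to '<below_th>':
    # seq == [1, 0, 2]
    return seq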
def vectorize(text, fixed_length=None):
    tokens = tokenize(text)
    if fixed_length is not None:
        # Pad with the integer 0; since vocab keys are presumably strings,
        # pad positions fall through to the UNKNOWN id below. Then truncate
        # to exactly fixed_length.
        tokens = (tokens + [0] * max(0, fixed_length - len(tokens)))[:fixed_length]
    return np.array(
        [vocab_lookup.get(token, vocab_lookup[UNKNOWN]) for token in tokens])
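# A hedged sketch of vectorize's padding/truncation contract; vocab_lookup
# and UNKNOWN are module globals defined elsewhere, so this only checks shape.
def _demo_vectorize():
    short = vectorize("one two", fixed_length=5)        # padded to 5 ids
    long_ = vectorize("a b c d e f g", fixed_length=5)  # truncated to 5 ids
    assert len(short) == len(long_) == 5
    return short, long_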
def send_request(self, text: str, olang: str, odomain: str) -> tuple:
    """Send prepared batch for translation.

    Endpoint receives
        msg = {"src": "hello", "conf": "ger,fml"}
    transferred in bytes via socket communication.

    Args:
        text: text to translate
        olang: output language
        odomain: output domain

    Returns:
        Tuple containing response with the translation or an error.
        The type of the first element is either str or bool, respectively.
    """
    msg = {"src": text.strip('|'), "conf": "{},{}".format(olang, odomain)}
    jmsg = bytes(json.dumps(msg), 'ascii')
    if self.connected:
        with self.lock:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                sock.connect((self.host, self.port))
                sock.sendall(b'HI')
                preresponse = sock.recv(5)
                assert preresponse == b'okay'
                sock.sendall(bytes("msize:" + str(len(jmsg) + 13), 'ascii'))
                politeness = sock.recv(11)
                assert politeness == b'still okay'
                sock.sendall(jmsg)
                rawresponse = sock.recv(2048)
                if rawresponse.startswith(b"msize:"):
                    in_msg_size = int(rawresponse.strip().split(b":")[1])
                    sock.sendall(b'OK')
                    rawresponse = sock.recv(in_msg_size + 13)
                try:
                    response = json.loads(rawresponse)
                except json.decoder.JSONDecodeError as e:
                    app.logger.debug('Received broken json', e)
                    app.logger.debug(rawresponse)
                    return False, f'Can not decode server raw response: {rawresponse}'
                try:
                    translation = response['final_trans']
                except KeyError:
                    app.logger.debug('Response does not contain translation')
                    app.logger.debug(response)
                    return False, f'Server response: {response}'
                responses = tokenize(translation)
                return responses  # return tuple?
            except Exception:
                return False, traceback.format_exc()
            finally:
                sock.close()
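# A minimal mock of the peer side of the handshake in send_request, useful
# for local testing. This is a sketch inferred from the client code above,
# not the real translation server.
import json
import socket

def run_mock_translation_server(host='127.0.0.1', port=5555):
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind((host, port))
    srv.listen(1)
    conn, _ = srv.accept()
    assert conn.recv(2) == b'HI'
    conn.sendall(b'okay')
    conn.recv(64)                 # the client's "msize:<n>" announcement
    conn.sendall(b'still okay')
    msg = json.loads(conn.recv(4096))
    # Echo the source text back as a fake translation.
    conn.sendall(json.dumps({'final_trans': msg['src']}).encode('ascii'))
    conn.close()
    srv.close()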
def extract_top_entities(collection, condition={"$exists": True}):
    """
    Extracts number of mentions, retweets, hashtags for a collection
    As well as top 50 words
    :collection -> MongoDB collection obtained with find() or JSON Tweets
    """
    mentions_count = {}
    retweets_count = {}
    hashtags_count = {}
    corpus = []
    for tweet in collection.find({"sentiment": condition}):
        corpus += tokenize(parse_tweet(tweet)).split(" ")
        tweeter = tweet["user"]
        # RETWEETS
        if tweet.get("retweeted_status"):
            rt_user = tweet["retweeted_status"]["user"]["screen_name"]
            if not retweets_count.get(rt_user):
                # get how many times the tweet has already been retweeted
                retweets_count[rt_user] = tweet["retweeted_status"]["retweet_count"]
            else:
                # we have already seen this retweet: tally up another RT
                retweets_count[rt_user] += 1
        if tweet.get("truncated"):
            # if tweet is truncated we need to look through the extended
            # tweet for entities
            tweet = tweet["extended_tweet"]
        # USER MENTIONS
        if tweet["entities"].get("user_mentions"):
            for user in tweet["entities"]["user_mentions"] + [tweeter]:
                user = user["screen_name"]
                if not mentions_count.get(user):
                    mentions_count[user] = 1
                else:
                    mentions_count[user] += 1
        # HASHTAGS
        tweet = get_body(tweet)
        if tweet["entities"].get("hashtags"):
            for h in tweet["entities"]["hashtags"]:
                hl = h["text"].lower()
                if not hashtags_count.get(hl):
                    hashtags_count[hl] = 1
                else:
                    hashtags_count[hl] += 1
    fdist = FreqDist(corpus)
    top_50 = fdist.most_common(50)
    return mentions_count, retweets_count, hashtags_count, top_50
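# FreqDist above comes from NLTK; a tiny self-contained illustration of the
# top-k call used at the end of extract_top_entities:
from nltk import FreqDist

def _demo_top_words():
    corpus = ["rt", "rt", "news", "rt", "news", "sports"]
    return FreqDist(corpus).most_common(2)  # -> [('rt', 3), ('news', 2)]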
def view_word_correlations(training_data, prop_words=0.25, n=200):
    """ View the most commonly co-occurring words """
    import pandas as pd
    from helpers import tokenize

    # get the most commonly occurring prop_words % of words
    word_counts = {}
    for word in training_data.words:
        word_counts[word] = len(training_data.words[word])
    swc = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    most_common_words = set()
    for i in range(len(swc)):
        if i >= (len(training_data.words) * prop_words):
            break
        most_common_words.add(swc[i][0])

    # map words to lists of word counts
    d = {}
    for word in most_common_words:
        d[word] = []
        for doc in training_data.docs:
            tokens = tokenize(doc[0])
            d[word].append(len([t for t in tokens if t == word]))
    df = pd.DataFrame(data=d)

    def get_redundant_pairs(df):
        '''Get diagonal and lower triangular pairs of correlation matrix'''
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i + 1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

    def get_top_abs_correlations(df, n):
        au_corr = df.corr().abs().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(
            ascending=False)
        return au_corr[0:n]

    print get_top_abs_correlations(df, n=n)
def sentiment_analysis(collection):
    """
    :collection -> MongoDB collection obtained with find() or list of documents
    """
    scores = []
    ids = []
    for tweet in collection:
        text = parse_tweet(tweet)
        scores.append(find_sentiment_tb(tokenize(text)))
        ids.append(tweet['_id'])
    assert len(scores) == len(ids)
    scores = np.array(scores)
    ids = np.array(ids)
    return scores, ids
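# find_sentiment_tb is called above but not shown here; the "tb" suffix
# suggests TextBlob, so this is one plausible sketch (an assumption about
# the helper, not its actual definition):
from textblob import TextBlob

def find_sentiment_tb_sketch(text):
    # polarity is a float in [-1, 1], negative to positive
    return TextBlob(text).sentiment.polarity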
def generate_heatmap(net, para, question):
    vectors = [vectors_from_question(p, q) for p, q in [(para, question)]]
    questions = np.array([q for ((p, q), mask) in vectors])
    passages = np.array([p for ((p, q), mask) in vectors])
    mask = net.session.run(net.output, {
        net.dropout: 1,
        net.question: questions,
        net.passage: passages
    })[0]
    top_n = sorted(range(len(mask)), key=lambda i: mask[i], reverse=True)[:10]
    mask = [(1 if i in top_n else 0) for i in range(len(mask))]
    tokens = tokenize(para.passage)
    heatmap = u" ".join([
        u"<span style='background-color: rgba(255,0,0,{0})'>{1}</span>".format(
            max(0, min(1, value)), word)
        for value, word in zip(mask, tokens)
    ])
    html = u"<h1>{0}</h1> <p>{1}</p>".format(question.question, heatmap)
    return html
def _get_clean_html(self, token=True):
    # nltk.clean_html was removed in newer NLTK releases; this snippet
    # targets an older NLTK version.
    cleaned = nltk.clean_html(self.text)
    normalized = normalize(cleaned)
    if token:
        return tokenize(normalized)
    else:
        return normalized
def fetch_collocations():
    # First we need to fetch all the abstracts for each year.
    for year in years:
        rows = []
        results = db.query(
            "SELECT ID, abstract FROM Abstract WHERE year = 20" + year)
        year_abstracts = ""
        for abstract in results:
            year_abstracts += " " + abstract[1]
        top_colls = get_collocations(tokenize(year_abstracts), 100)
        for r in results:
            colls_to_insert = [
                c[0] + " " + c[1] for c in top_colls
                if c[0] + " " + c[1] in r[1]
            ]
            for col in colls_to_insert:
                rows.append((r[0], 2000 + int(year), col))
        db.insert_into_mysql('Collocation', coll_columns, rows)
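# get_collocations is not defined in this snippet; a plausible sketch using
# NLTK's bigram collocation finder (an assumption about the helper). It
# returns (w1, w2) tuples, matching the c[0] + " " + c[1] usage above.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

def get_collocations_sketch(tokens, n):
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)  # drop very rare bigrams
    return finder.nbest(BigramAssocMeasures.likelihood_ratio, n)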
def readfile(self):
    with open(self.path) as fp:
        extra = []
        iterator = FileMacroIterator(fp)
        for line in iterator:
            line = line.rstrip()
            if line in ManpageParser.forbidden_lines:
                raise NotSupportedFormat(self.path)
            # Fix buggy lines
            line = ManpageParser.line_replacement.get(line, line)
            for k, v in ManpageParser.str_replacement:
                line = line.replace(k, v)
            if Line.comment in line:
                # Line has comment
                line = line.split(Line.comment, 1)[0]
            if line and len(line) > 2:
                if line[-1] == "\\":
                    if line[-2] != "\\" and line[-2] != "{":
                        extra.append(line[:-1])
                        continue
                elif line[-2:] == "\\c":
                    extra.append(line[:-2])
                    continue
            if extra:
                extra.append(line)
                line = ' '.join(extra)
                extra = []
            if line == Line.cc or line == Line.c2 or line == '\'.':
                # Empty line (cc or c2)
                continue
            if not line:
                # Empty line
                self.lines.append(('', ''))
                continue
            if line[0] in {Line.cc, Line.c2}:
                chunks = line[1:].lstrip().split(None, 1)
                if not chunks:
                    # Very special case lvm2create_initrd.8
                    continue
                macro = chunks[0]
                if macro == '"':
                    # Bug in run.1
                    continue
                if macro == 'b':
                    # Bug in devlink-sb.8
                    macro = 'B'
                if macro in self.custom_macros.macros:
                    iterator.add_lines(self.custom_macros.macros[macro])
                    continue
                if len(chunks) == 2:
                    rest = chunks[1]
                else:
                    rest = ""
                if macro == 'so':
                    raise RedirectedPage(self.path, rest)
                if line.startswith(".el\\{\\"):
                    # There is a lot of crap in pages (isag.1, for instance)
                    macro = "el"
                    rest = "\\{\\" + rest
                if macro in Macro.conditional:
                    # FIXME: This needs reworking
                    braces = 0
                    if "\\{" in rest:
                        braces += 1
                    if "\\}" in rest:
                        braces -= 1
                    while braces:
                        macro_line = next(iterator)
                        if "\\{" in macro_line:
                            braces += 1
                        if "\\}" in macro_line:
                            braces -= 1
                        if not braces:
                            break
                    continue
                if self.parser is None:
                    if macro == "TH":
                        self.parser = ManpageParser.process_man7
                    elif macro == "Dd":
                        self.parser = False
                if macro == 'ig':
                    while True:
                        macro_line = next(iterator)
                        if macro_line.rstrip().startswith(".."):
                            break
                    continue
                if macro in {'de', 'de1'}:
                    self.custom_macros.add_macro(rest.strip())
                    while True:
                        macro_line = next(iterator)
                        if macro_line.rstrip().startswith(".."):
                            break
                        else:
                            self.custom_macros.add_line(macro_line)
                    continue
                if macro in Macro.ignore:
                    continue
                # Macro start
                if macro == 'if':
                    # FIXME
                    continue
                if macro in Macro.vertical_spacing:
                    self.lines.append(('', ''))
                else:
                    self.lines.append((macro, tokenize(entitize(rest))))
            else:
                self.lines.append(('', entitize(line)))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--split',
        type=str,
        default='train',
        help='Specify which part of the dataset you want to dump to text. '
        'Your options are: train, val, test, test-dev')
    parser.add_argument(
        '--answers',
        type=str,
        default='modal',
        help='Specify if you want to dump just the most frequent answer for '
        'each question (modal), or all the answers (all)')
    args = parser.parse_args()

    # nlp = English()  # used for counting number of tokens

    data_dir = '/fs/project/PAS1315/VQA/Annotations/'
    data_dir1 = '/fs/project/PAS1315/VQA/Questions/'
    if args.split == 'train':
        annFile = data_dir + 'v2_mscoco_train2014_annotations.json'
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_train2014_questions.json'
        questions_file = 'data/preprocessed/questions_train2014.txt'
        questions_id_file = 'data/preprocessed/questions_id_train2014.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_train2014.txt'
        if args.answers == 'modal':
            answers_file = 'data/preprocessed/answers_train2014_modal.txt'
        elif args.answers == 'all':
            answers_file = 'data/preprocessed/answers_train2014_all.txt'
        coco_image_id = 'data/preprocessed/images_train2014.txt'
        coco_image_path = 'data/preprocessed/images_train2014_path.txt'
        data_split = 'training data'
        subtype = 'train2014'
    elif args.split == 'val':
        annFile = data_dir + 'v2_mscoco_val2014_annotations.json'
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_val2014_questions.json'
        questions_file = 'data/preprocessed/questions_val2014.txt'
        questions_id_file = 'data/preprocessed/questions_id_val2014.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_val2014.txt'
        if args.answers == 'modal':
            answers_file = 'data/preprocessed/answers_val2014_modal.txt'
        elif args.answers == 'all':
            answers_file = 'data/preprocessed/answers_val2014_all.txt'
        coco_image_id = 'data/preprocessed/images_val2014_all.txt'
        coco_image_path = 'data/preprocessed/images_val2014_path.txt'
        data_split = 'validation data'
        subtype = 'val2014'
    elif args.split == 'test-dev':
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_test-dev2015_questions.json'
        questions_file = 'data/preprocessed/questions_test-dev2015.txt'
        questions_id_file = 'data/preprocessed/questions_id_test-dev2015.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_test-dev2015.txt'
        coco_image_id = 'data/preprocessed/images_test-dev2015.txt'
        coco_image_path = 'data/preprocessed/images_test-dev2015_path.txt'
        data_split = 'test-dev data'
        subtype = 'test-dev2015'
    elif args.split == 'test':
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_test2015_questions.json'
        questions_file = 'data/preprocessed/questions_test2015.txt'
        questions_id_file = 'data/preprocessed/questions_id_test2015.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_test2015.txt'
        coco_image_id = 'data/preprocessed/images_test2015.txt'
        coco_image_path = 'data/preprocessed/images_test2015_path.txt'
        data_split = 'test data'
        subtype = 'test2015'
    else:
        raise RuntimeError(
            'Incorrect split. Your choices are:\ntrain\nval\ntest-dev\ntest')

    # initialize VQA api for QA annotations
    # vqa = VQA(annFile, quesFile)
    questions = json.load(open(quesFile, 'r'))
    ques = questions['questions']
    if args.split == 'train' or args.split == 'val':
        qa = json.load(open(annFile, 'r'))
        qa = qa['annotations']

    # pbar = progressbar.ProgressBar()
    print('Dumping questions, answers, questionIDs, imageIDs, and questions '
          'lengths to text files...')
    imdir = '%s/COCO_%s_%012d.jpg'
    N = len(ques)

    print('')
    print('{} Writing {} questions file {}'.format('*' * 10, args.split,
                                                   '*' * 10))
    # Output files are opened in binary mode because the writes below encode
    # to utf8 bytes.
    with open(questions_file, 'wb') as f:
        for i, q in zip(range(N), ques):
            f.write((q['question'] + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} questions file {}'.format(
        '*' * 10, args.split, '*' * 10))

    print('')
    print('{} Writing {} questions lengths file {}'.format(
        '*' * 10, args.split, '*' * 10))
    with open(questions_lengths_file, 'wb') as f:
        for i, q in zip(range(N), ques):
            f.write((str(len(tokenize(q['question']))) + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} questions length file {}'.format(
        '*' * 10, args.split, '*' * 10))

    print('')
    print('{} Writing {} questions id file {}'.format('*' * 10, args.split,
                                                      '*' * 10))
    with open(questions_id_file, 'wb') as f:
        for i, q in zip(range(N), ques):
            f.write((str(q['question_id']) + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} questions id file {}'.format(
        '*' * 10, args.split, '*' * 10))

    print('')
    print('{} Writing {} coco_image id file {}'.format('*' * 10, args.split,
                                                       '*' * 10))
    with open(coco_image_id, 'wb') as f:
        for i, q in zip(range(N), ques):
            f.write((str(q['image_id']) + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} coco_image id file {}'.format(
        '*' * 10, args.split, '*' * 10))

    print('')
    print('{} Writing {} coco_image_path file {}'.format(
        '*' * 10, args.split, '*' * 10))
    with open(coco_image_path, 'wb') as f:
        for i, q in zip(range(N), ques):
            image_path = imdir % (subtype, subtype, int(q['image_id']))
            f.write((image_path + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} coco_image_path file {}'.format(
        '*' * 10, args.split, '*' * 10))

    print('')
    print('{} Writing {} answers file {}'.format('*' * 10, args.split,
                                                 '*' * 10))
    with open(answers_file, 'wb') as f:
        for i, q in zip(range(N), ques):
            if args.answers == 'modal':
                f.write(getModalAnswer(qa[i]['answers']).encode('utf8'))
            elif args.answers == 'all':
                f.write(getAllAnswer(qa[i]['answers']).encode('utf8'))
            f.write('\n'.encode('utf8'))
    print('{} Done writing {} answers file {}'.format('*' * 10, args.split,
                                                      '*' * 10))

    print('')
    print('completed dumping {}'.format(data_split))
notfound = 0
regexnotfound = 0
regexincorrect = 0
regexhits = 0
try:
    kattismatrix(a, hmm.stdin)
    kattismatrix(b, hmm.stdin)
    kattismatrix(q, hmm.stdin)
    run = False
    for person in people:
        run = not run
        if run:
            continue
        text = person["description_en"]
        tokens = helpers.tokenize(text)
        hmm.stdin.write(str(len(tokens)) + " " + " ".join(
            [str(toState(token, wordlist)) for token in tokens]) + "\n")
        result = hmm.stdout.readline()
        result = [int(word) for word in result.split(" ")]
        values = [helpers.extract(x, type) for x in person[property].split(";")]
        weguessed = False
        correct = False
        # The regex guess
        r = pattern.search(person["description_en"])
        regexguess = None
        if r:
            regexguess = r.group(0)
            if regexguess in values:
                regexhits += 1
            else:
        for img in tweet['medias_files'].split('|'):
            retweets[img] = count

# Creating output folder
os.makedirs(output_folder_path, exist_ok=True)

# Method n°1 - tfidf token selection
# NOTE: method n°1 is inconclusive
dfs = Counter()

for item in metadata:
    captions = item['captions']
    for caption in captions:
        text = caption['caption']
        tokens = tokenize(text)
        for token in tokens:
            dfs[token] += 1

# for item in metadata:
#     captions = item['captions']
#     for caption in captions:
#         text = caption['caption']
#         tokens = tokenize(text)
#         best_token = max(tokens, key=lambda token: math.log(1 / dfs[token]))
#         print('%s -> best token is: %s' % (colored(text, 'cyan'), colored(best_token, 'red')))

# Method n°2 - prefix clustering
            output.append(array[i:i + bud_size, j:j + bud_size])
    return output

cam = cv2.imread('input/waterfall1_5.png')
cam = cv2.cvtColor(cam, cv2.COLOR_BGR2RGB)
# cam = data.coffee()
colorList = np.unique(cam.reshape(-1, cam.shape[2]), axis=0)
print("colors:", colorList)
color_codebook = dict()
color_reverse_codebook = dict()
for i, color in enumerate(colorList):
    color_codebook[i] = color
    color_reverse_codebook[tokenize(color)] = i
reduced_cam = np.zeros((cam.shape[0], cam.shape[1]))
for i in range(cam.shape[0]):
    for j in range(cam.shape[1]):
        reduced_cam[i][j] = color_reverse_codebook[tokenize(cam[i][j])]
# print(reduced_cam)
a, b, c = cam.shape
offset = 1
newCam = np.zeros((a + 2 * offset, b + 2 * offset, c))
for i in range(cam.shape[0]):
    for j in range(cam.shape[1]):
        if abs(cam.shape[0] -
    else:
        addWord('1', countmap, globalcountmap, count)

backtoback = 0
backtoprefix = 0
targettotarget = 0
targettopost = 0
run = True
for person in people:
    run = not run
    if run:
        continue
    things = person[property].split(";")
    things = map(lambda x: helpers.extract(x, type), things)
    things = [helpers.tokenize(thing) for thing in list(set(things))]
    thingsFound = [0 for thing in things]
    text = helpers.tokenize(person["description_en"])
    prevPostEnd = 0
    currWord = 0
    while currWord < len(text):
        for i, thing in enumerate(things):
            if thing[thingsFound[i]] != text[currWord]:
                thingsFound[i] = 0
                continue
            thingsFound[i] += 1
            if thingsFound[i] < len(thing):
                continue
            start = currWord - len(thing) + 1
        self.qas = [QA(qa) for qa in data['qas']]


class QA(object):
    def __init__(self, data):
        self.question = data['question']
        # array of dictionaries, with keys `answer_start` (a character
        # index) and `text`
        self.answers = data['answers']


def test():
    return Dataset('data/dev-v1.1.json')


def train():
    return Dataset('data/train-v1.1.json')


if __name__ == '__main__':
    t = train()
    passage_length_distribution = [
        len(tokenize(p.passage)) for p in t.paragraphs
    ]
    question_length_distribution = [
        len(tokenize(q.question)) for para in t.paragraphs for q in para.qas
    ]
    print "Passage length distribution (tokens):", print_distribution(
        passage_length_distribution)
    print "Question length distribution (tokens):", print_distribution(
        question_length_distribution)
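# print_distribution is referenced above but not defined in this snippet; a
# plausible sketch using numpy percentiles (an assumption about the helper):
import numpy as np

def print_distribution_sketch(values):
    values = np.asarray(values)
    return "min={} p50={} p95={} max={}".format(
        values.min(), int(np.percentile(values, 50)),
        int(np.percentile(values, 95)), values.max())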
del FLAGS.from_json
config = vars(FLAGS)
if not os.path.isdir(FLAGS.model_name):
    os.mkdir(FLAGS.model_name)
with open(os.path.join(FLAGS.model_name, 'config.json'), 'w') as f:
    json.dump(config, f)

data = pd.read_csv(config['file'])[['question1', 'question2',
                                    'is_duplicate']].astype(str)
print('data loaded')

cell = LSTMCell if config['cell'] == 'lstm' else GRUCell
N = data.shape[0]
inds = np.random.permutation(N)
if config['cutoff_type'] == 'count':
    q1, q2, vocab_size, words_inds = helpers.tokenize(
        data, cutoff_count=config['cutoff_count'])
else:
    q1, q2, vocab_size, words_inds = helpers.tokenize(
        data, cutoff_number=config['cutoff_nr'])
split = int(N * config['cv_ratio'])

with open(os.path.join(config['model_name'], 'words.json'), 'w') as f:
    json.dump(words_inds, f)

train = helpers.pair_iterator(q1[inds[:split]], q2[inds[:split]],
                              data.ix[inds[:split], 'is_duplicate'].astype(int),
                              batch=config['batch'])
test = helpers.pair_iterator(q1[inds[split:]], q2[inds[split:]],
                             data.ix[inds[split:], 'is_duplicate'].astype(int),
                             batch=config['batch'] * 4)
print('iterators created')

model = siamese.siamese(hidden_units=config['hidden'],
                        embedding_size=config['embed'],
                        vocab_size=vocab_size,
                        cell=cell,
                        bidirectional=config['bidirectional'],
                        clipping='none')
def _get_title_and_desc(self, token=True):
    try:
        soup = BeautifulSoup(self.text,
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
    except:
        t, v, tb = sys.exc_info()
        l = traceback.format_exception(t, v, tb)
        if self.debug:
            print "".join(l)
        del t
        del v
        del tb
        return {"title": [], "description": []}
    title = ""
    try:
        title = smart_str(soup.title.text)
        if self.debug:
            print "\n\ntitle :"
            print title
    except:
        t, v, tb = sys.exc_info()
        l = traceback.format_exception(t, v, tb)
        if self.debug:
            print "".join(l)
        del t
        del v
        del tb
    desc = ""
    try:
        d = pq(self.text)
        desc = d('meta').filter("[name=description]").attr('content')
        if self.debug:
            print "\n\ndescription :"
            print desc
    except:
        t, v, tb = sys.exc_info()
        l = traceback.format_exception(t, v, tb)
        if self.debug:
            print "".join(l)
        del t
        del v
        del tb
    if token:
        tok_title = []
        tok_desc = []
        if title and len(title):
            if self.debug:
                print 'in the title branch'
            tok_title = tokenize(normalize(title))
            if self.debug:
                print str(type(tok_title))
                print tok_title
        elif self.debug:
            print 'in the title else branch T_T'
        if desc and len(desc):
            if self.debug:
                print 'in the desc branch'
            tok_desc = tokenize(normalize(desc))
            if self.debug:
                print str(type(tok_desc))
                print tok_desc
        elif self.debug:
            print 'in the desc else branch T_T'
        if self.debug:
            print tok_title
            print tok_desc
        tok_title.extend(tok_desc)
        if self.debug:
            print "token title+desc return: \n" + str(type(tok_title))
        return tok_title
    else:
        return {"title": title, "description": desc}
from argparser import parser
from generator import Generator
from helpers import tokenize, load_text

if __name__ == '__main__':
    generator = Generator()
    args = parser.parse_args()
    if args.action == 'fit':
        if not args.source:
            print('You must specify an input text source to fit the model.')
            exit(0)
        text = load_text(args.source)
        generator.fit(tokenize(text))
        generator.save(args.file)
    elif args.action == 'generate':
        try:
            generator.load(args.file)
        except FileNotFoundError:
            print('Data file does not exist.')
            exit(0)
        completed, sentence = generator.generate(args.n)
        if not completed:
            print('The sentence is incomplete.')
        print(sentence)
    else:
        print('Please specify a correct action - either "fit" or "generate". '
              'Type -h for more info.')
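# Example invocations (illustrative only; assumes the argparse setup in the
# argparser module defines these flags -- the real flag names may differ):
#   python main.py fit --source corpus.txt --file model.dat
#   python main.py generate --file model.dat --n 20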