import logging
import os
import re
import time

import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from tqdm import tqdm
from wikipediaapi import Wikipedia  # assumed source of the Wikipedia class used below

import spacy

log = logging.getLogger(__name__)

# Assumes a spaCy model with word vectors (e.g. en_core_web_md); without vectors,
# Doc.vector and Doc.similarity() are unreliable.
nlp = spacy.load('en_core_web_md')

# Constants such as SENTENCE_SPEC, TITLES, and EXCLUDE_HEADINGS are assumed to be
# defined elsewhere in this module.
def generate_sentence(spec=SENTENCE_SPEC, sentence_id=None):
    """ Generate a random sentence using the word probabilities specified in SENTENCE_SPEC

    >>> spec = {
    ...     "answers": [[{"HDL": 0.95, "good_cholesterol": 0.05},
    ...                  {"150": 0.01, "145": 0.01, "unk": 0.98}]],
    ...     "sentences": ["Patient LDL level is 100, ________ level is 50, and the total is ______ ."],
    ... }
    >>> s = generate_sentence(spec=spec, sentence_id=0)
    >>> s  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    'Patient LDL level is 100, ... level is 50, and the total is ... .'
    >>> s[26:42] in ('HDL level is 50,', 'good_cholesterol')
    True
    >>> s.split()[-2] in ('150', '145', 'unk')
    True
    """
    sentences = spec['sentences']
    if sentence_id is None:
        sentence_id = np.random.randint(0, len(sentences))
    sentence = sentences[sentence_id]
    answer = spec['answers'][sentence_id]
    i_unk = 0
    tokens = []
    for tok in nlp(sentence):
        if re.match(r'^(_+|unk|\[MASK\])$', tok.text):
            # sample a filler token according to the probabilities for this blank
            possible_tokens, p = list(zip(*answer[i_unk].items()))
            tokens.append(np.random.choice(a=possible_tokens, p=p) + tok.whitespace_)
            i_unk += 1
        else:
            # preserve original whitespace so character offsets in the doctest hold
            tokens.append(tok.text_with_ws)
    return ''.join(tokens)
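# A minimal usage sketch (a hypothetical helper, not part of the original module):
# sample a batch of synthetic sentences from a spec with the same schema as the
# doctest spec above, e.g. to build a synthetic training set.
def generate_sentences(n=10, spec=SENTENCE_SPEC):
    """ Sample n filled-in sentences from the spec """
    return [generate_sentence(spec=spec) for _ in range(n)]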
def get_sentences(df, size_limit=50000, vector_dim=None):
    vector_dim = len(nlp('word').vector) if vector_dim is None else int(vector_dim)
    sents = []
    sentvecs = []  # accumulated across all docs so rows stay aligned with `sents`
    docvecs = np.zeros((len(df), vector_dim))  # assumes df.index is a RangeIndex 0..len(df)-1
    encodings = []
    for file_id, row in tqdm(df.iterrows(), total=len(df)):
        encodings.append('utf8')
        if row['size'] <= size_limit and row['is_journal']:
            try:
                with open(row['path'], 'rb') as fin:
                    bintext = fin.read()
                try:
                    text = bintext.decode()
                except UnicodeDecodeError:
                    encodings[-1] = 'latin'
                    log.warning(f"LATIN?: {row['path']}")
                    text = bintext.decode('latin')
                doc = nlp(text)
            except UnicodeDecodeError:
                log.error(f"UnicodeDecodeError: {row['path']}")
                continue
            docvecs[file_id, :] = doc.vector
            docsents = [
                dict(sentence_pos=f'{file_id}-{j}', file_id=file_id, text=s.text)
                for j, s in enumerate(doc.sents)
            ]
            log.info(f"Read {len(docsents)} sentences: {row['path']}")
            sents.extend(docsents)
            sentvecs.extend([s.vector for s in doc.sents])
        else:
            log.warning(f"skipped {row['path']}")
    df['encoding'] = encodings
    df = pd.concat([df, pd.DataFrame(docvecs)], axis=1)
    df_sents = pd.DataFrame(sents, index=list(range(len(sents))))
    df_sents = pd.concat([df_sents, pd.DataFrame(np.array(sentvecs))], axis=1)
    df_sents.index.name = 'sentence_id'
    return df, df_sents
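# A minimal sketch (an assumption, not the original build step) of how the
# `files_index.ann` file loaded by search_csv() below could be built from the
# docvec columns that get_sentences() appends to `df`. The 'angular' metric and
# n_trees value are guesses; they must match whatever search_csv() uses.
def build_file_index(df, index_path='files_index.ann', num_dims=300, n_trees=10):
    index = AnnoyIndex(f=num_dims, metric='angular')
    # docvec columns appended by get_sentences() are the integer-named columns 0..num_dims-1
    for i, vec in enumerate(df[list(range(num_dims))].values):
        index.add_item(i, vec)  # item ids align with df.iloc positions used in search_csv()
    index.build(n_trees)
    index.save(index_path)
    return index_path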
def semantics(reply, stmt=None, **kwargs):
    """ Compute word2vec docvec cosine similarity (falls back to character IOU)

    >>> semantics('Hello world!', 'Goodbye big earth!') > .5
    True
    """
    global nlp
    nlp = kwargs.get('nlp', nlp)
    if nlp is None or not stmt or not reply:
        return 0.0
    reply_doc, stmt_doc = nlp(reply), nlp(stmt)
    if not reply_doc or not stmt_doc or not reply_doc.has_vector or not stmt_doc.has_vector:
        # FIXME: Levenshtein distance (or fuzzywuzzy) would be a better fallback
        return iou(reply, stmt)
    cos_sim = reply_doc.similarity(stmt_doc)
    log.debug(f'cos_sim={cos_sim}')
    return cos_sim
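# `iou()` is called above but not defined in this excerpt. A minimal sketch,
# assuming a character-level intersection-over-union (Jaccard) similarity; the
# original implementation may use token sets or n-grams instead.
def iou(a, b):
    """ Jaccard similarity of the character sets of two strings (0.0 to 1.0) """
    chars_a, chars_b = set(str(a).lower()), set(str(b).lower())
    union = len(chars_a | chars_b)
    return len(chars_a & chars_b) / union if union else 0.0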
def term_vector_dict(terms, keys=None):
    # terms can sometimes (rarely) be floats because of pd.read_csv (df_titles)
    terms = [str(t) if t else '' for t in terms]
    keys = terms if keys is None else list(keys)
    vector_list = []
    log.info(f'Computing doc vectors for {len(terms)} terms...')
    for k, term in zip(keys, terms):
        vec = nlp(term).vector
        vec /= np.linalg.norm(vec) or 1.  # L2-normalize, guarding against zero-length vectors
        mask_zeros = np.abs(vec) > 0
        if mask_zeros.sum() < len(mask_zeros):
            log.warning(f'BAD VEC: {term} [0]*{len(mask_zeros) - mask_zeros.sum()}')
        vector_list.append((k, vec))
    return dict(vector_list)
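# A short usage sketch (a hypothetical helper, not in the original module): since
# term_vector_dict() L2-normalizes its vectors, a dot product with a normalized
# query vector is the same cosine similarity used elsewhere in this module.
def most_similar_terms(query, vectors, topn=3):
    """ Rank the keys of a term_vector_dict() result by cosine similarity to query """
    qvec = nlp(query).vector
    qvec = qvec / (np.linalg.norm(qvec) or 1.)
    scored = sorted(((float(v.dot(qvec)), k) for k, v in vectors.items()), reverse=True)
    return [(k, sim) for sim, k in scored[:topn]]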
def search_csv(
        csv_path='/midata/private/journal/files.csv',
        query='Misima island port harbor derelict ship PNG Papua New Guinea Australis harbor storm sailing cliffs anchor drag',
        num_results=10,
        num_dims=300):
    df = pd.read_csv(csv_path, index_col=0)
    index_path = os.path.join(os.path.dirname(csv_path), 'files_index.ann')
    # 'angular' is an assumption; the metric must match the one used to build the index
    index = AnnoyIndex(f=num_dims, metric='angular')
    index.load(index_path)
    vec = nlp(query).vector
    paths = []
    for i in index.get_nns_by_vector(vec, num_results):
        path = df.iloc[i]['path']
        paths.append(path)
        print(path)
        with open(path, 'rb') as fin:
            bintext = b''.join(fin.readlines()[:10])  # preview only the first 10 lines
        try:
            text = bintext.decode()
        except UnicodeDecodeError:
            text = bintext.decode('latin')
        print(text)
        print('-' * 120)
    return paths
def scrape_articles(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                    see_also=True, max_articles=10000, max_depth=3):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> df = scrape_articles(['ELIZA'], see_also=False)
    >>> df.shape
    (87, 4)
    >>> df.columns
    Index(['depth', 'title', 'section', 'sentence'], dtype='object')
    """
    titles = list([titles] if isinstance(titles, str) else titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    title_depths = list(zip(titles, depths))
    sentences = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree ("See also" links) so that .2*title1+.3*title2+.5*title3
    #        can be semantically appended to sentences
    titles_scraped = set([''])
    title, d = '', 0
    wiki = Wikipedia()  # recent Wikipedia-API versions may require a user_agent/language argument
    for depth in range(max_depth):
        for i in range(max_articles):
            title = None
            while not title or title in titles_scraped:
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.warning(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.error(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"); .text does not
                section = page.section_by_title('See also')
                if section:
                    for t in section.text.split('\n')[1:]:
                        if t in page.links:
                            title_depths.append((t, d + 1))
                    log.debug(f'extended title_depths at depth {d}: {title_depths}')
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                # spaCy doesn't handle "latin" (extended ASCII) apostrophes well
                text = section.text.replace('’', "'")
                # FIXME: need to rejoin short names before colons, like 'ELIZA:' 'Tell me...' and 'Human:' 'What...'
                # FIXME: need to split on question marks without whitespace where the next word is
                #        capitalized: "...to be unhappy?Though designed strictly..."
                sentences.extend([
                    (d, title, section.title, s.text) for s in nlp(text).sents
                    if len(s.text.strip().strip('"').strip("'").strip()) > 1
                ])
            log.debug(f'Parsed {len(sentences)} sentences.')
            log.info(str([depth, d, i, title]))
            if d > depth:
                log.info(f"{d} > {depth}")
                break
    return pd.DataFrame(sentences, columns='depth title section sentence'.split())
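# Usage sketch: scrape the seed article without following "See also" links, as in
# the doctest above. The output filename is illustrative, not from the original code.
if __name__ == '__main__':
    df_eliza = scrape_articles(titles=['ELIZA'], see_also=False)
    df_eliza.to_csv('eliza_sentences.csv')
    print(df_eliza.head())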