def _convert_item(self, item):
    """Convert a sentence to a list of tokens."""
    if self.pre_splitted:
        return item
    elif self.split:
        return any2unicode(item).split()
    else:
        return self.split_func(any2unicode(item))

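# A minimal sketch of the three code paths above; `Cfg` is a hypothetical
# shell exposing just the attributes _convert_item reads, so we can call the
# function directly with an explicit `self`.
from gensim.utils import any2unicode, simple_preprocess

class Cfg:
    def __init__(self, pre_splitted=False, split=False, split_func=None):
        self.pre_splitted = pre_splitted
        self.split = split
        self.split_func = split_func

print(_convert_item(Cfg(pre_splitted=True), ['already', 'tokenized']))  # passes through
print(_convert_item(Cfg(split=True), 'plain whitespace split'))         # unicode + split()
print(_convert_item(Cfg(split_func=simple_preprocess), 'Custom Splitter!'))
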
def bow_mail_body(txt, nlp):
    """
    args:
        - txt: raw text
        - nlp: a spacy engine
    """
    # to unicode & get rid of accents
    txt = deaccent(any2unicode(txt))
    # split on reply/forward markers (get rid of quoted headers)
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter
    bow = []
    for sent in sentences:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False, entity=False)
        for tok in doc:
            if (tok.lemma_ and not tok.is_punct and not tok.is_stop
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(tok) > 1
                    and not any(x in tok.orth_ for x in not_in_list)):
                if tok.orth_.startswith("-") or tok.orth_.endswith("-"):
                    bow.append(tok.lemma_.replace("-", ""))
                else:
                    bow.append(tok.lemma_)
    return bow

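# Hedged usage sketch: the parse=False / entity=False keyword form above is
# the spaCy 1.x call style, so this assumes a spaCy 1.x pipeline, plus the
# module globals used by bow_mail_body (re_fw_regex, lower_upper_pat,
# number_letter_pat, REGEX, not_in_list) being defined as in this module.
import spacy

nlp = spacy.load('en')  # spaCy 1.x-style model loading
raw = "Fwd: meeting\n> Are you coming tomorrow?\nYes, see you at 10am."
print(bow_mail_body(raw, nlp))  # a list of filtered lemmas, e.g. ['meeting', 'come', ...]
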
def extract_names(txt, nlp, n_sentences=2):
    """
    Use the spacy entity engine to build a bag of words from the first
    `n_sentences`, excluding tokens recognized as person names.

    (Despite the function name, this returns the surrounding lemmas, not the
    PERSON entities themselves: note the tok.ent_type_ != 'PERSON' filter.)

    args:
        - txt: raw text
        - nlp: a spacy engine
        - n_sentences: number of leading sentences to process
    return:
        - list of lemmas as strings
    """
    # to unicode & get rid of accents
    txt = deaccent(any2unicode(txt))
    # split on reply/forward markers (get rid of quoted headers)
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter
    bow = []
    for sent in sentences[:n_sentences]:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False)
        for tok in doc:
            lemma = drop_digits(replace_punct(tok.lemma_))
            if (lemma and tok.ent_type_ != 'PERSON' and not tok.is_punct
                    and not tok.is_stop and lemma not in extendedstopwords
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(lemma) > 1
                    and not any(x in tok.orth_ for x in not_in_list)):
                bow.append(lemma)
    return bow

def test_get_offsets_and_start_doctags_win(self):
    # Each line takes 7 bytes (the '\n' is written as '\r\n' on Windows)
    lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
    tmpf = get_tmpfile('gensim_doc2vec.tst')

    with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
        for line in lines:
            fout.write(utils.any2unicode(line))

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
    self.assertEqual(offsets, [0])
    self.assertEqual(start_doctags, [0])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
    self.assertEqual(offsets, [0, 14])
    self.assertEqual(start_doctags, [0, 2])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
    self.assertEqual(offsets, [0, 7, 21])
    self.assertEqual(start_doctags, [0, 1, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
    self.assertEqual(offsets, [0, 7, 14, 21])
    self.assertEqual(start_doctags, [0, 1, 2, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
    self.assertEqual(offsets, [0, 7, 14, 21, 28])
    self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
    self.assertEqual(offsets, [0, 0, 7, 14, 14, 21])
    self.assertEqual(start_doctags, [0, 0, 1, 2, 2, 3])

def test_get_offsets_and_start_doctags(self):
    # Each line takes 6 bytes (including the '\n' character)
    lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
    tmpf = get_tmpfile('gensim_doc2vec.tst')

    with utils.open(tmpf, 'wb', encoding='utf8') as fout:
        for line in lines:
            fout.write(utils.any2unicode(line))

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
    self.assertEqual(offsets, [0])
    self.assertEqual(start_doctags, [0])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
    self.assertEqual(offsets, [0, 12])
    self.assertEqual(start_doctags, [0, 2])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
    self.assertEqual(offsets, [0, 6, 18])
    self.assertEqual(start_doctags, [0, 1, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
    self.assertEqual(offsets, [0, 6, 12, 18])
    self.assertEqual(start_doctags, [0, 1, 2, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
    self.assertEqual(offsets, [0, 6, 12, 18, 24])
    self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
    self.assertEqual(offsets, [0, 0, 6, 12, 18, 24])
    self.assertEqual(start_doctags, [0, 0, 1, 2, 3, 4])

def test_save_as_line_sentence_ru(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [
        l.split()
        for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')
    ]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
        self.assertEqual(sentences, ref_sentences)

def test_save_as_line_sentence_ru(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [
        l.split()
        for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')
    ]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.smart_open(corpus_file, encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
        self.assertEqual(sentences, ref_sentences)

def test_save_as_line_sentence_en(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [
        l.split()
        for l in utils.any2unicode('hello world\nhow are you').split('\n')
    ]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
        self.assertEqual(sentences, ref_sentences)

def test_save_as_line_sentence_en(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [
        l.split()
        for l in utils.any2unicode('hello world\nhow are you').split('\n')
    ]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.smart_open(corpus_file, encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
        self.assertEqual(sentences, ref_sentences)

def get_block_of_tweets(filepath):
    block_of_tweets = ''
    with open(filepath) as tweetsfile:
        tweetreader = csv.reader(tweetsfile)
        metadata = next(tweetreader)  # skip the header row
        for tweetrow in tweetreader:
            tweetstring = tweetrow[0]
            block_of_tweets += tweetstring

    clean_block_of_tweets = utils.any2unicode(
        block_of_tweets.replace('\n', ' ').replace('\t', ' '), errors='ignore')
    text = [word for word in clean_block_of_tweets.lower().split() if word not in stoplist]
    return text

def __iter__(self):
    """Iterate through the lines in the source."""
    try:
        # Assume it is a file-like object and try treating it as such.
        # Things that don't have seek will trigger an exception.
        self.source.seek(0)
        for line in itertools.islice(self.source, self.limit):
            line = utils.any2unicode(line, errors='replace').split()
            i = 0
            while i < len(line):
                yield line[i: i + self.max_sentence_length]
                i += self.max_sentence_length
    except AttributeError:
        # If it didn't work like a file, use it as a string filename.
        with utils.smart_open(self.source) as fin:
            for line in itertools.islice(fin, self.limit):
                line = utils.any2unicode(line, errors='replace').split()
                i = 0
                while i < len(line):
                    yield line[i: i + self.max_sentence_length]
                    i += self.max_sentence_length

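# Usage sketch of the windowing above: this is gensim's LineSentence-style
# iterator, so a long line is yielded in chunks of at most
# max_sentence_length tokens ('corpus.txt' is an assumed sample file with
# one whitespace-separated sentence per line).
from gensim.models.word2vec import LineSentence

for chunk in LineSentence('corpus.txt', max_sentence_length=5):
    print(chunk)  # each item is a list of <= 5 tokens
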
def __iter__(self):
    """Iterate through the lines in the source.

    Yields
    ------
    tuple : (list[str], int)
        Tuple of list of string and index
    """
    with open(self.path, "rb") as f:
        for i, line in enumerate(f):
            yield (any2unicode(line).split(), i)

def __iter__(self):
    """Iterate through the lines in the source.

    Yields
    ------
    :class:`~fse.inputs.IndexedSentence`
        IndexedSentence from `path` specified in the constructor.
    """
    with s_open(self.path, "rb") as f:
        for i, line in enumerate(f):
            yield IndexedSentence(any2unicode(line).split(), i)

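# Hedged usage sketch: both iterators above read one sentence per line and
# pair its tokens with the line number. The equivalent standalone loop
# ('lines.txt' is an assumed sample file):
from gensim.utils import any2unicode

with open('lines.txt', 'rb') as f:
    for i, line in enumerate(f):
        print(i, any2unicode(line).split())
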
def getSimilarityMatrix(self, sf, recomUrls, userUrls):
    uu = utils()

    # create the corpus
    corpus = []
    for iindex, rowi in recomUrls.iterrows():
        ur1 = rowi['url']
        sf, sm1 = self.querySummary(sf, ur1)
        sm1 = genu.any2unicode(sm1)
        sm1 = uu.createTaggedDataForSummary(sm1)
        corpus.append(sm1)

    dictionary = corpora.Dictionary(corpus)
    corpusBow = [dictionary.doc2bow(text) for text in corpus]
    tfidf = models.TfidfModel(corpusBow)
    corpus_tfidf = tfidf[corpusBow]

    # create lsi model
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)
    # corpus_lsi = lsi[corpus_tfidf]
    index = similarities.MatrixSimilarity(lsi[corpusBow])

    rCorpus = []
    for iindex, rowi in userUrls.iterrows():
        ur2 = rowi['url']
        sf, sm2 = self.querySummary(sf, ur2)
        sm2 = genu.any2unicode(sm2)
        sm2 = uu.createTaggedDataForSummary(sm2)
        rCorpus.append(sm2)

    # generate results
    vec_bow = [dictionary.doc2bow(text) for text in rCorpus]
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    # rows are user urls and columns are recommended urls
    return sims

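# A minimal, self-contained sketch of the same gensim pipeline
# (Dictionary -> TF-IDF -> LSI -> MatrixSimilarity), with toy token lists
# standing in for the summaries fetched by querySummary:
from gensim import corpora, models, similarities

docs = [['colon', 'cancer', 'screening'],
        ['lynch', 'syndrome', 'genetics'],
        ['weather', 'forecast', 'tomorrow']]
queries = [['cancer', 'genetics']]

dictionary = corpora.Dictionary(docs)
bow = [dictionary.doc2bow(d) for d in docs]
tfidf = models.TfidfModel(bow)
lsi = models.LsiModel(tfidf[bow], id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[bow])

sims = index[lsi[[dictionary.doc2bow(q) for q in queries]]]
print(sims)  # one row per query, one column per document
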
def preprocess_txt(raw_txt):
    """
    Preprocessing of raw txt before parsing with Spacy:
        - deaccent, convert to unicode
        - split on forward/redirect markers
        - replace the > of email replies
        - split lowerUpper
        - split letterNumber
    """
    txt = deaccent(any2unicode(raw_txt))
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    txt = " ".join(lower_upper_pat.split(txt))
    txt = " ".join(number_letter_pat.split(txt))
    return txt

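# The module globals above (re_fw_regex, lower_upper_pat, number_letter_pat)
# are not shown in this snippet; plausible definitions (assumptions, not the
# original code) could look like this. Note the zero-width splits require
# Python 3.7+.
import re
from gensim.utils import deaccent, any2unicode

re_fw_regex = re.compile(r'-{2,}\s*(?:Forwarded message|Original Message)\s*-{2,}', re.IGNORECASE)
lower_upper_pat = re.compile(r'(?<=[a-z])(?=[A-Z])')       # "helloWorld" -> "hello World"
number_letter_pat = re.compile(r'(?<=[0-9])(?=[A-Za-z])')  # "10am" -> "10 am"

print(preprocess_txt("réunionToday at 10am > ok"))  # e.g. "reunion Today at 10 am   ok"
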
def test_cython_linesentence_readline_after_getting_offsets(self):
    lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
    tmpf = get_tmpfile('gensim_doc2vec.tst')

    with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
        for line in lines:
            fout.write(utils.any2unicode(line))

    from gensim.models.word2vec_corpusfile import CythonLineSentence

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)

    for offset, line in zip(offsets, lines):
        ls = CythonLineSentence(tmpf, offset)
        sentence = ls.read_sentence()
        self.assertEqual(len(sentence), 1)
        self.assertEqual(sentence[0], utils.any2utf8(line.strip()))

def predict_candidate(blob_of_tweets, k_neighbors, k_threshold):
    training_set = []
    for candidate_handle in candidate_handles:
        candidate_folder = candidate_supporter_tweets_folders[candidate_handle]
        dirlist = [f for f in os.listdir(candidate_folder) if f.endswith('.csv')]
        shuffle(dirlist)
        for filename in dirlist:
            training_set.append(os.path.join(candidate_folder, filename))

    tfidf, index, dictionary, id_to_path_dict, corpus = create_tfidf_from_file()
    clean_block_of_tweets = utils.any2unicode(
        blob_of_tweets.replace('\n', ' ').replace('\t', ' '), errors='ignore')
    text = [word for word in clean_block_of_tweets.lower().split() if word not in stoplist]
    return classify_tfidf_knn(tfidf, dictionary, index, text,
                              k_neighbors, id_to_path_dict, k_threshold)

def process_page(page):
    """
    Preprocess a single periodical page, returning the result as a unicode string.

    Removes all non-alpha characters from the text.

    Args:
        page (str): the page content

    Returns:
        str: content of the file, without punctuation and non-alpha characters
    """
    content = utils.any2unicode(page, 'utf8').strip()
    content = re.sub(r"[^a-zA-Z]", " ", content)
    return content

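# Quick usage check (assumes `from gensim import utils` and `import re` as
# used above): digits and punctuation are replaced with spaces.
print(process_page("Page 12: Hello, world!"))  # e.g. "Page     Hello  world "
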
def add_to_gensim_dictionary_and_corpus(dictionary, corpus, id_to_path_dict, path_to_tweets_csv):
    # corpus should be a list (of lists), and dictionary must be a gensim dictionary
    block_of_tweets = ''
    with open(path_to_tweets_csv) as tweetsfile:
        tweetreader = csv.reader(tweetsfile)
        try:
            metadata = next(tweetreader)
        except StopIteration:
            # the CSV is empty for some reason
            return dictionary, corpus, id_to_path_dict
        for tweetrow in tweetreader:
            tweetstring = tweetrow[0]
            block_of_tweets += tweetstring

    clean_block_of_tweets = utils.any2unicode(
        block_of_tweets.replace('\n', ' ').replace('\t', ' '), errors='ignore')
    text = [word for word in clean_block_of_tweets.lower().split() if word not in stoplist]

    id_to_path_dict[len(corpus)] = path_to_tweets_csv
    dictionary.add_documents([text])
    corpus.append(dictionary.doc2bow(text))
    return dictionary, corpus, id_to_path_dict

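# Hedged usage sketch: incrementally building a gensim Dictionary and BOW
# corpus over a folder of tweet CSVs (the 'tweets/*.csv' path is an assumption).
import glob
from gensim import corpora

dictionary = corpora.Dictionary()
corpus, id_to_path = [], {}
for path in glob.glob('tweets/*.csv'):
    dictionary, corpus, id_to_path = add_to_gensim_dictionary_and_corpus(
        dictionary, corpus, id_to_path, path)
print(len(corpus), 'documents,', len(dictionary), 'unique tokens')
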
def analyze(self, text):
    try:
        unitext = any2unicode(text, encoding='utf8', errors='strict')
    except UnicodeDecodeError:
        print("Not utf-8")
        return []

    # convert to lower case
    lowerText = unitext.lower()

    # whitespace tokenization: gives some text 'qwe (x)' as 'qwe', '(x)'
    tokenizer = WhitespaceTokenizer()
    regexTokens = tokenizer.tokenize(lowerText)

    # stem, then drop single-letter tokens
    p_stemmer = PorterStemmer()
    stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]
    stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w) > 1]
    return stemmedRemSingleLetterTokens

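# Hedged usage sketch: the tokenizer and stemmer above come from NLTK
# (nltk.tokenize.WhitespaceTokenizer, nltk.stem.porter.PorterStemmer); the
# one-off Shell class below exists only to supply a `self` for the method.
from gensim.utils import any2unicode
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.porter import PorterStemmer

class Shell:
    analyze = analyze  # reuse the function defined above as a method

print(Shell().analyze("Running runners ran quickly"))
# -> ['run', 'runner', 'ran', 'quickli']
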
def _process(self, s):
    """Takes in a string and returns either a string (no tokenization)
    or a list of strings (tokenized).
    """
    # TODO: Clarify contract return value for filtered strings
    # TODO: Add docs for: if a filter evaluates as True the string is discarded

    # Skip empty strings:
    if len(s) == 0:
        # Keep return type consistent
        return [] if self.tokenizer else ''

    # Normalize the encoding before anything else
    if self.normalize_encoding:
        s = any2unicode(s)

    # Apply sub, filter, sub
    for sub in self.pre_substitutions:
        s = sub(s)

    # If any of the filters return True, filter the string
    if any(f(s) for f in self.filters):
        # Keep return type consistent
        return [] if self.tokenizer else ''

    for sub in self.post_substitutions:
        s = sub(s)

    # Tokenize last. If you want to process the tokens, use another Transformer.
    if self.tokenizer and s:
        # materialize: filter() is lazy on Python 3
        s = list(filter(None, self.tokenizer(s)))

    return s

def __getitem__(self, i):
    """Returns the line indexed by i. Primarily used for
    :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`

    Parameters
    ----------
    i : int
        The line index used to index the file

    Returns
    -------
    str
        line at the current index
    """
    if not self.get_able:
        raise RuntimeError("To index the lines, you must construct with get_able=True")

    with open(self.path, "rb") as f:
        f.seek(self.line_offset[i])
        output = f.readline()
        f.seek(0)
    return any2unicode(output).rstrip()

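# Hedged sketch of how a line_offset table (used above) could be built:
# record the byte offset of every line start, so __getitem__ can seek
# straight to line i instead of scanning the whole file.
def build_line_offsets(path):
    offsets, pos = [], 0
    with open(path, "rb") as f:
        for line in f:
            offsets.append(pos)   # byte offset where this line starts
            pos += len(line)
    return offsets
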
print "begin" i2e={} index_fixe=[0] pretrainedFile="/media/data/datasets/models/new_arame/"+lang+".vec" if remote else "data/pretrained.txt" if remote: sys.path.insert(0, '/home/arame/hakken-api/models/') import model import utils max_voc=2500 if extract else 'inf' pretrained=model.model(pretrainedFile,max_voc=max_voc,decale=1) for word in words: if any2unicode(word) in pretrained.vocab: i=w2i[word] if False and len(word)>1:# and word[0:2]=="##": index_fixe.append(i) i2e[i]=pretrained.getVector(word) else: import utils wordsModel,floatsModel=utils.loadModel(pretrainedFile) for word,oldi in wordsModel.items(): if word in words: i=w2i[word] if len(word)>1:# and word[0:2]=="#": index_fixe.append(i) i2e[i]=floatsModel[oldi] print "finish load pretrain"
stop_words = get_custom_stop_words()

pruned_words, counters, total_words = Phrases.learn_vocab(
    sentences=LineSentence(unigram_sentences_path),
    max_vocab_size=800000000,
    common_terms=stop_words,
    progress_per=100)

counters = sorted(counters.items(), key=lambda key_value: key_value[1], reverse=True)
count = 0
for key, value in counters:
    count += 1
    print(any2unicode(key), value)
print(count)

bigram_model = Phrases(
    LineSentence(unigram_sentences_path),
    max_vocab_size=800000000,
    progress_per=100,
    threshold=0.5,
    min_count=100,
    common_terms=stop_words,
    scoring='npmi')

for sentence in LineSentence(unigram_sentences_path):
    bigram_sentence = u' '.join(bigram_model[sentence])
    print(bigram_sentence + '\n')

def main():
    # NOTE I used tr -d to remove ' from the file
    # contents = TaggedPubMed('big_home_test.txt')
    contents = TaggedPubMed("age_fix.txt")  # ma

    vocab_list = [
        'rs10795668', 'mir-135a', 'lynch_syndrom_i', 'biopsi', 'diseas', 'c18.8',
        'folfiri-cetuximab', 'rs4939827', 'iiib', 'colon_carcinoma', 'outcom',
        'transvers_colon_cancer', 'therapi_resist', 'ctnnb1', 'iiia', 'rs1035209',
        'famili_histori', 'relaps_free_surviv', 'p14', 'anastomosi', 'cowden_syndrom',
        'oxaliplatin', 'msi-h', 'bleed', 'dna_imag_cytometri', 'capox', 'weight_loss',
        'icd', 'endorect_mri', 'aflibercept', 'argon', 'egf', 'immunotherapi',
        'physic_activ', 'rs4925386', 'c18.0', 'side_effect', 'diseas_subtyp',
        'angiogenesi_inhibitor', 'cloacogen_carcinoma', 'colon_neoplasm', 'cd29',
        'dysplasia_in_inflammatori_bowel_diseas', 'serrat_polyposi', 'epcam',
        'intestin_polyposi', 'rs1800469', 'cd44', 'mir-135b', 'g1n1317', 'rs34612342',
        'symptom', 'rectal_cancer', 'ramucirumab', 'interstiti_brachytherapi', 'vegfa',
        'tetraploid', 'msi', 'rx', 'fap', 'array-cgh', 'mir-92', 'irinotecan',
        't4a-n2a-m0', 'adenomat_polyposi_syndrom', 'colon_cancer', 'radiofrequ_ablat',
        'hereditari_nonpolyposi_type_5', 'r2', 'microrna_marker', 'mucos', 'ras-mapk',
        'gardner_syndrom', 'gene', 'neoadjuv_chemo', 'iic', 'adjuv_chemo',
        'doubl_contrast_barium_enema', 'mgmt', 'smoke', 'euploid', 'tingl', 'cyramza',
        'monoclon_antibodi', 'vomit', 'appetit_loss', 'nausea', 'c18.4', 'mlh1',
        'mir-155', 'c18.6', 'ihc_msi_marker', 'barium_enema',
        'hamartomat_polyposi_syndrom', 'msh6', 'respons', 'biomark', 'd17s250',
        'rs12603526', 'hereditari_nonpolyposi', 'alcohol', 'pi3k', 'rtk', 'nausea',
        'blood_disord', 'lack_of_physic_exercis', 'follow-up',
        'immun_checkpoint_inhibitor', 'pembrolizumab', 'transan_endoscop_microsurgeri',
        'weak', 'colorect_cancer', 'rs10911251',
        'polymeras_proofreading-associ_polyposi', 'iib', 'dna_msi_test_result',
        'molecular_featur', 'descend_colon_cancer', 'c18.5', 't4b-n0-m0',
        'hepat_arteri_infus', 'molecular_marker_test', 'rs1799977', 'predict', 'p16',
        '18q_ai_express', 'stereotact', 'anu_neoplasm', 'cd133', 'fever', 'ivb',
        'good', 'colon_kaposi_sarcoma', 'wnt', 'e1317q', 'rs3802842', 'weak_muscl',
        'tis-n0-m0', 'splenic_flexur_cancer', 'chemotherapi', 'target_therapi',
        'c18.7', 'turcot_syndrom', 'mir-21', 'rs4779584', 'adenosquam_colon_carcinoma',
        'pathwai', 'upset_stomach', 'gender_male', 'rs11169552', 'surviv', 'rs459552',
        'rs3217810', 'intern', 'overal_surviv', 'rectal_bleed', 'braf_mutat',
        't1-n0-m0', 'extern_beam', 'pms2_loss', 'blood_base', 'gardner_syndrom',
        'attenu_adenomat_polyposi_coli', 'ptgs2', 't2-n0-m0', 'ploidi_statu',
        'genom_instabl', 'bloodi_stool', 'progress_diseas',
        'hereditari_nonpolyposi_type_8', 'nervou_system_effect', 'headach',
        'stomach_pain', 'five-year_surviv', 'local_excis', 'type',
        'hereditari_nonpolyposi_type_6', 'iii', 't1–t2-n1/n1c-m0', 'therapi',
        'hair_loss', 'cea', 'chemotherapi_drug', 'rs3824999', 'colon_lymphoma',
        'recurr', 'ulcer_coliti', 'diseas_etiolog', 'g2', 'apoptot', 'iiic',
        'ani_t_-ani_n-m1b', '0', 'high_red_meat_diet', 'juvenil_polyposi_syndrom',
        'rs1800734', 'microscopi', 'dmmr', 'fit', 'r0', 'mri', 'skin_irrit',
        'leukopenia', 'ng', 'system', 'desmoid_diseas', 'pole', 'ctc', 'mir-211',
        'iia', 'rs12241008', 'malign', 'g13d', 'rs961253', 'ag',
        'hereditari_mix_polyposi_syndrom_2', 'dpyd', 'epigenet_gene_silenc', 'f594l',
        'constip', 'cologuard', 'hereditari_colon_cancer', 't4b-n1–n2-m0', 'poor',
        'obes', 'partial', 'region', 'r1', 'thrombocytopenia', 'dmmr_test',
        'colon_sarcoma', 'rs174550', 'peel', 'rectum_cancer', 't1–t2-n2b-m0',
        'd2s123', 'rs4444235', 'laparoscopi', 'cin_marker', 'loss_of_balanc',
        'laser_therapi', 'kra_mutat_test', 'snp', 'liver_metastasi', 'prognosi',
        'rs1321311', 'ct', 'aneuploid', 'g12v', 'kra', 'rs36053993', 'msi_test',
        'hereditari_nonpolyposi_type_4', 'apc', 'timp-1', 'g4', 'p53_express',
        'fda_approv_drug', 'g12', 'singl_specimen_guaiac_fobt', 'combin',
        'neuropathi', 'mlh1_loss', 'endocavitari', 'fungal_infect',
        'hereditari_nonpolyposi_type_1', 'braf_mutat_test', 'anemia', 'cea_assai',
        'colorect_neoplasm', 'polyploidi_test', 'regorafenib', 'g1', 'dna_msi_marker',
        'peutz-jegh_syndrom', 'adenomat_polyposi_coli', 'rs10411210', 'epcam',
        'colectomi', 'prognost', 'autosom_recess_colorect_adenomat_polyposi',
        'hereditari_nonpolyposi_type_3', 'rs158634',
        'colon_l-cell_glucagon-lik_peptid_produc_tumor', 'c20',
        'metastat_colorect_cancer', 'xeliri', 'burn', 'hyperplast_polyposi_syndrom',
        'bevacizumab', 'rectosigmoid_juction_cancer', 'european', 't2–t3-n2a-m0',
        'carbon_dioxid', 'cd24', 'tumor_msi-h_express', 'colorect_adenocarcinoma',
        'ani_t-_ani_n-m1a', 'virtual_colonoscopi', "crohn'_diseas", 'tender',
        'diploid', 't3–t4a-n1/n1c-m0', 'pms2', 'muscl_pain', 'folfiri-bevacizumab',
        'rectal_neoplasm', 'predict_biomark', 'braf', 'nra_mutat', 'bat25', 'pet',
        'rs1042522', 'complet', 'cin', 'sigmoid_colon_cancer', 'ascend_colon_cancer',
        'radiat_therapi', 'krt20', 'mouth_and_throat_sore', 'bat26', 'apc_mutat',
        'dre', 'colon_leiomysarcoma', 'fatigu', 'ra_mutat_test', 'c19', 'diagnosi',
        'shake', 'lynch_syndrom', 'c18.9', 'tyrosin_kinas_inhibitor', 'risk_factor',
        'ca_19-9', 'hmlh1', 'msh2_loss', 'rs4813802', 'colostomi', 'screen', 'v600e',
        'colon_singlet_ring_adenocarcinoma', 'alter_bowel_habit', 'xelox', 'iva',
        'ii', 'stabl_diseas', 'rs12309274', 'i', 'hereditari_nonpolyposi_type_7',
        'lung_metastasi', 'anal_canal_carcinoma', 'fu-lv', 'prognost_biomark',
        'colon_small_cell_carcinoma', 'resect', 'rs647161', 'li-fraumeni_syndrom',
        'q61k', 'rs10936599', 'sexual_issu', 'rs7758229', 'hepat_flexur_cancer',
        'proctectomi', 'clinic_featur', 'msh2', 'dna_mismatch-repair', 'c18.2',
        'mrt', 'cryosurgeri', 'pik3ca', 'hereditari_mix_polyposi_syndrom_1',
        'oligodontia-colorect_cancer_syndrom', 'sept9_methyl', 'fit', 'lonsurf',
        'exercis', 'pain', 'east_asian', 'colonoscopi', 'adenoma', 'tgf-β', 'g12d',
        'rs704017', 'surgeri', 'faecal_m2-pk', 'polyploidi_test_result', 'msh6_loss',
        'inherit_genet_disord', 'lgr5', 'kra_mutat',
        'submucos_invas_colon_adenocarcinoma', 'bmi', 'r_classif', 'rs9929218',
        'sigmoidoscopi', 'stem_cell', 'mutyh-associ_polyposi', '5-fu', 'vegf',
        't3–t4a-n2b-m0', 'nonpolyposi_syndrom', 't1-n2a-m0', 'hyperthermia',
        'high_fat_intak', 'type_of_care', 'g3', 'popul_base_snp', 'alk', 'mir-92a',
        'cd166', 'anal_gland_neoplasm', 't4a-n0-m0', 'metastasi', 'd5s346',
        'rs10849432', 'blister', 'rs61764370', 'rs1801155', 'plod1', 'c18.3',
        'optic_colonoscopi', 'mir-31', 'rs16892766', 'iv', 'rectosigmoid_cancer',
        'panitumumab', 't3-n0-m0', 'mir-17', 'gx', 'fish', 'cognit_dysfunct',
        'egfr', 'rs1801166', 'prognost_factor', 'bladder_irrit',
        'acut_myelocyt_leukemia', 'tym', 'uicc_stage', 'folfox',
        'lipomat_hemangiopericytoma', 'rs6691170', 'aldh1', 'tumor_bud', 'mutyh',
        'mss', 'grade', 'attenu_famili_adenomat_polyposi', 'colon_adenocarcinoma',
        'high_sensit_faecal_occult_blood_test', 'samson_gardner_syndrom',
        'colon_mucin_adenocarcinoma', 'pmmr', 'tp53', 'g463v', 'capsul_colonoscopi',
        'colon_squamou_cell_carcinoma', 'rectal_irrit', 'c18.1', 'hra', 'ceacam5',
        'neodymium:yttrium-aluminum-garnet', 'cetuximab', 'folfiri', 'rs6983267',
        'msi-l', 'c18',
    ]

    # NOTE only the Fixed_Multi-Tag model has this update!
    vocab_list = [any2unicode(element) for element in vocab_list]

    dim = 200
    win = 8
    neg = 10
    kwargs = {
        "sent": contents,
        "vocab": vocab_list,
        "dim": dim,
        "win": win,
        "min_cnt": 2,
        "neg": neg,
        "iter": 20,
        "tag_doc": contents,
    }
    Dis2Vec(**kwargs).run_Dis2Vec()