def file_to_features(path, word_vocab, window, min_count, total_w):
    examples = []
    toktok = ToktokTokenizer()
    punct = set(string.punctuation)
    try:
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                for sentence in sent_tokenize(line):
                    words_1 = toktok.tokenize(sentence)
                    words_2 = []
                    for i, word in enumerate(words_1):
                        word_l = word.lower()
                        if word_l not in word_vocab:
                            continue
                        if word_vocab[word_l] < min_count:
                            continue
                        if word in punct:
                            continue
                        # word2vec-style frequency subsampling:
                        # discard probability p = 1 - sqrt(t / f)
                        frequency = word_vocab[word_l] / total_w
                        number = 1 - math.sqrt(10e-5 / frequency)
                        if random.uniform(0, 1) <= number:
                            continue
                        words_2.append(word)
                    max_j = len(words_2)
                    for i, word in enumerate(words_2):
                        start = i - window if (i - window) > 0 else 0
                        to = i + window if (i + window) < max_j else max_j
                        for j in range(start, to):
                            if i == j:
                                continue
                            target = words_2[j]
                            examples.append((word, target))
    except Exception as error:
        print(error)
    return examples
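# Hedged usage sketch (not from the original project): assumes `word_vocab`
# maps lowercased tokens to counts (e.g. a collections.Counter, as in the
# build_vocabs snippet below) and `total_w` is the total token count; the
# file path and parameters are illustrative only.
from collections import Counter

example_vocab = Counter({'the': 120, 'cat': 15, 'sat': 9, 'mat': 7})
example_total = sum(example_vocab.values())
pairs = file_to_features('corpus/doc0.txt', example_vocab,
                         window=2, min_count=5, total_w=example_total)
# `pairs` is a list of (center_word, context_word) skip-gram tuples.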
def build_vocabs(directory_path, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    char_vocab.update(['{', '}'])
    filenames = os.listdir(directory_path)
    filepaths = [os.path.join(directory_path, e) for e in filenames]
    for i, filepath in enumerate(filepaths):
        if i % 100 == 0:
            print('Reading file number {}'.format(i), end="\r")
        with open(filepath, 'r', encoding='utf8') as f:
            try:
                line = f.read()
                if 'numbers_' in filepath:
                    tmp = toktok.tokenize(line.lower())
                    for _ in range(min_count):
                        word_vocab.update(tmp)
                else:
                    word_vocab.update(word_tokenize(line.lower()))
                char_vocab.update(line)
            except Exception as error:
                print('Error with file: {}'.format(filepath))
                print(error)
    return word_vocab, char_vocab
def tokenize(i_file, o_file):
    toktok = ToktokTokenizer()
    with open(i_file, 'r') as i_f, open(o_file, 'w') as o_f:
        for line in tqdm(i_f):
            line = line.rstrip('\n')
            tokens = toktok.tokenize(line)
            print(' '.join(tokens), file=o_f)
def extract_wiki_fdict():
    f_count = 0
    # for each wiki table, get header name and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_fdict_path, 'w')
    toktok = ToktokTokenizer()
    tid = 0
    pool = mp.Pool()
    for line in f:
        tid += 1
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except:
            continue
        # combine header and features
        cols_features = pool.map(gov_data.get_col_features,
                                 list(header_content.values()))
        all_col_features = list(
            zip([each[0] for each in header_span], cols_features))
        for i in range(len(all_col_features)):
            if all_col_features[i][1]:
                all_col_features[i][1]['content'] = header_bows[i]
        all_col_features = list(filter(lambda x: x[1], all_col_features))
        f_dest.write(json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
def loss_char(sentence, position):
    toktok = ToktokTokenizer()
    if sentence[position] in " ,./;'[]\<>?:{}!@#$% ^&*()":
        return sentence
    if sentence[position] == " ":
        return sentence
    if sentence[position] in toktok.tokenize(sentence):
        return sentence
    return sentence[:position] + sentence[position + 1:]
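# Hedged illustration (not from the original source): loss_char() drops the
# character at `position` unless that character is punctuation, a space, or a
# standalone token according to ToktokTokenizer.
loss_char("hello world", 1)    # -> "hllo world"   ('e' is removed)
loss_char("hello world", 5)    # -> "hello world"  (space: left unchanged)
loss_char("hello, world", 5)   # -> "hello, world" (punctuation: unchanged)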
def RemoveWords_by_tag(text):
    remove_tag_list = ['JJ', 'JJR', 'JJS', 'RBR', 'RBS']
    token = ToktokTokenizer()
    words = token.tokenize(text)
    words_tagged = nltk.pos_tag(words)
    # Filter out the words whose POS tag belongs to the categories to remove
    filtered = untag([w for w in words_tagged if not w[1] in remove_tag_list])
    return ' '.join(map(str, filtered))
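# Hedged example (not from the original source): the exact output depends on
# the tags assigned by nltk.pos_tag, but adjectives (JJ/JJR/JJS) and
# comparative/superlative adverbs (RBR/RBS) are typically removed.
RemoveWords_by_tag("The quick brown fox jumps over the lazy dog")
# -> "The fox jumps over the dog"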
def extract_wiki_features(wiki_feature_path, wiki_bow_path):
    f_count = 0
    # for each wiki table, get header name and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_feature_path, 'w')
    f_bow = open(wiki_bow_path, 'w')
    toktok = ToktokTokenizer()
    for line in f:
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except:
            continue
        # combine header and features
        for col, f_dict, bows in zip([each[0] for each in header_span],
                                     map(get_col_features, header_content.values()),
                                     header_bows.values()):
            if f_dict:
                f_dict['_id'] = t['_id']
                f_dest.write(json.dumps({col: f_dict}) + '\n')
                f_bow.write(json.dumps({col: bows}) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
def extract_gov_fdict(all_resources,
                      fdict_path=gov_data_fdict_path,
                      tid_type='cat_id',
                      restrict_resource=False):
    # extracting features:
    # table_id;label,curated_features,content;label,curated_features...
    f = open(fdict_path, 'w')
    # all_resources = gov_data.read_resources()
    all_resources = gov_data.wrong_csv(all_resources)
    all_resources = list(filter(lambda x: x.status, all_resources))
    if restrict_resource:
        all_resources = gov_data.select_resources(all_resources, fsize=50,
                                                  rs_ct=len(all_resources))
    pool = mp.Pool()
    total = len(all_resources)
    count = 0
    toktok = ToktokTokenizer()
    for resource in all_resources:
        print("processing {0}-th resource".format(count))
        for each_data in resource.data_files:
            try:
                if tid_type == 'cat_id':
                    tid = resource.rs_id + ':' + each_data.df_id
                elif tid_type == 'path':
                    tid = resource.path + '/' + each_data.df_id
                d_path = each_data.path + '/data.csv'
                df = pd.read_csv(d_path, delimiter=',', quotechar='"',
                                 dtype=str, na_filter=True)
                cols = df.columns
                contents = [df[each_col].dropna().tolist() for each_col in cols]
                print("extract content finished")
                cols_features = pool.map(gov_data.get_col_features, contents)
                all_col_features = list(zip(cols, cols_features))
                for i in range(len(all_col_features)):
                    if all_col_features[i][1]:
                        all_col_features[i][1]['content'] = toktok.tokenize(
                            ' '.join(contents[i]))
                all_col_features = list(
                    filter(lambda x: x[1], all_col_features))
                f.write(json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
            except Exception as e:
                print(e)
        count += 1
        print("finish {0} out of {1}".format(count, total))
    f.close()
    return all_resources
class Solver(AbstractSolver):
    def __init__(self):
        self.morph = morph
        self.toktok = ToktokTokenizer()
        self.bert = BertEmbedder()

    def get_num(self, text):
        lemmas = [
            self.morph.parse(word)[0].normal_form
            for word in self.toktok.tokenize(text)
        ]
        if "указывать" in lemmas and "предложение" in lemmas:
            w = lemmas[lemmas.index("указывать") + 1]
            d = {"один": 1, "два": 2, "три": 3, "четыре": 4, "предложение": 1}
            if w in d:
                return d[w]
        elif "указывать" in lemmas and "вариант" in lemmas:
            return 2
        return 2

    def compare_text_with_variants(self, variants):
        variant_vectors = self.bert.sentence_embedding(variants)
        predicts = []
        for i in range(0, len(variant_vectors)):
            for j in range(i + 1, len(variant_vectors)):
                sim = cosine_similarity(
                    variant_vectors[i].reshape(1, -1),
                    variant_vectors[j].reshape(1, -1)
                ).flatten()[0]
                predicts.append(pd.DataFrame({"sim": sim, "i": i, "j": j}, index=[1]))
        predicts = pd.concat(predicts)
        indexes = predicts[predicts.sim == predicts.sim.max()][["i", "j"]].values[0]
        return sorted([str(i + 1) for i in indexes])

    def sent_split(self, text):
        reg = r"\(*\d+\)"
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r"\(*1\)", task["text"])[:2]
        variants = [t["text"] for t in task["question"]["choices"]]
        text, task = "", ""
        if "Укажите" in task_text:
            text, task = re.split("Укажите ", task_text)
            task = "Укажите " + task
        elif "Укажите" in first_phrase:
            text, task = task_text, first_phrase
        return text, task, variants

    def predict_from_model(self, task):
        text, task, variants = self.process_task(task)
        result = self.compare_text_with_variants(variants)
        return result
def index(request):
    global invertedIndex
    global jsonData
    output_links = []
    searchTermsReq = request.GET.get('term', '')
    print(searchTermsReq)
    tokenizer = ToktokTokenizer()
    searchTerms = tokenizer.tokenize(searchTermsReq)
    print(searchTerms)
    response = {}
    output_data = defaultdict(int)
    output_links = []
    for token in searchTerms:
        token = token.lower()
        if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
            print('Looking through high for: ' + token)
            for docFilePath in invertedIndex[token]['high']:
                tfidf = invertedIndex[token]['high'][docFilePath]
                output_data[docFilePath] += tfidf
    if (len(output_data) < 10):
        for token in searchTerms:
            token = token.lower()
            if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
                print('Looking through low for: ' + token)
                for docFilePath in invertedIndex[token]['low']:
                    tfidf = invertedIndex[token]['low'][docFilePath]
                    output_data[docFilePath] += tfidf
    output_data = sorted(output_data.items(), key=itemgetter(1), reverse=True)
    for docFilePath, tfidf in output_data[:10]:
        output_links.append((jsonData[docFilePath], tfidf))
    output_links.sort(key=itemgetter(1), reverse=True)
    response['term'] = searchTermsReq
    response['results'] = output_links
    response['totalURLs'] = len(output_data)
    response['uniqueTokens'] = len(invertedIndex)
    response['totalDocuments'] = len(jsonData)
    return JsonResponse(response)
def error_generator(utterance):
    toktok = ToktokTokenizer()
    length = len(utterance)
    nb = nb_of_errors_in_utterance(length) + 1
    utterance = utterance + " "
    for i in range(nb):
        length = len(utterance) - 1
        position = np.random.choice(range(length), p=(length) * [1 / (length)])
        l = len(toktok.tokenize(utterance))
        utterance_old = utterance
        # pick one of the corruption functions at random
        func_id = np.random.randint(1, 5)
        utterance = functions[func_id](utterance, position)
    return utterance
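# Hedged sketch (not from the original module): `functions` and
# `nb_of_errors_in_utterance` are defined elsewhere in the source project.
# A hypothetical minimal setup, for illustration only, could be:
import numpy as np

def nb_of_errors_in_utterance(length):
    # assume roughly one error per ten characters
    return np.random.poisson(length / 10)

functions = {
    1: loss_char,                                  # drop a character (see loss_char above)
    2: lambda s, p: s[:p] + s[p:p + 1] + s[p:],    # duplicate a character
    3: lambda s, p: s[:p] + s[p + 1:p + 2] + s[p:p + 1] + s[p + 2:],  # swap neighbours
    4: lambda s, p: s,                             # no-op placeholder
}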
def clean_archive_data(folder):
    toktok = ToktokTokenizer()
    if not os.path.exists(f"{folder}-cleaned"):
        os.makedirs(f"{folder}-cleaned")
    for count, file in enumerate(os.listdir(f"{folder}")):
        if count % 1000 == 0:
            print(count)
        file_data = open(f"{folder}/{file}", "r").read()
        try:
            text_newspaper = toktok.tokenize(fulltext(file_data))
            text_newspaper_cleaned = clean(" ".join(text_newspaper))
            with open(f"{folder}-cleaned/{file}", "w") as output:
                output.write(text_newspaper_cleaned)
        except:  # pylint: disable=W0702
            print(f"error with {file}", file=sys.stderr)
def preprocess(data):
    X, Y = [], []
    toktok = ToktokTokenizer()
    for index, review in data.iterrows():
        if (index + 1) % 100000 == 0:
            print(index + 1)
        # words = nltk.word_tokenize(review['text'])
        tokens = toktok.tokenize(review['text'].lower())
        X.append(tokens)
        # X.append(nltk.word_tokenize(review['text']))
        Y.append(int(review['stars'] - 1))
        # if len(Y) == 10000:
        #     break
    df_new = pd.DataFrame({'text': X, 'stars': Y})
    return df_new
def build_vocabs(filepath, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    with open(filepath, 'r', encoding='utf8') as f:
        try:
            line = f.read()
            if 'numbers_' in filepath:
                tmp = toktok.tokenize(line.lower())
                for i in range(min_count):
                    word_vocab.update(tmp)
            else:
                word_vocab.update(word_tokenize(line.lower()))
            char_vocab.update(line)
        except Exception as error:
            print('Error with file: {}'.format(filepath))
            print(error)
    return word_vocab, char_vocab
def word_frequencies(contents):
    toktok = ToktokTokenizer()
    string_corpus = brown.raw()
    # Frequencies for each file
    fdists = []
    for file in contents.keys():
        print("Tokenising", file)
        tokenised = [toktok.tokenize(sent) for sent in sent_tokenize(string_corpus)]
        fdist = Counter(chain(*tokenised))
        fdists.append(fdist)
    # Combine keys into one set, eliminating duplicates
    print("Making frequency distribution of all words that we care about.")
    keys = []
    for sublist in fdists:
        keys += sublist
    keys = set(keys)
    # Build combined frequency dict
    # Tuple of identifiers for connectives and other common words
    unwanted = ('at', 'to', 'in', 'ma', 'bez', 'ppss', 'pp$', 'dt', 'bedz',
                'hv', 'cc', 'cs', 'hvd', 'wdt', '*', 'bed', 'ber', 'be',
                'np$', 'ppo', 'pps', 'abn', 'cd', 'md', 'ben', 'ben', 'wps',
                'vbd', 'jj', 'rb', 'do', 'ql', 'dts', 'rp', 'in-tl', 'ex',
                'i', 'dti', 'dod', 'wrb', 'hvz', 'nn$')
    # This is far from the best way to do this, but I couldn't find the
    # documentation for these identifiers
    frequencies = {}
    for key in keys:
        total = 0
        # Gets rid of unwanted tokens
        if (key[0] not in string.punctuation) and (key.split('/')[-1] not in unwanted):
            for sublist in fdists:
                if key in sublist.keys():
                    total += sublist[key]
            frequencies[key.split('/')[0].lower()] = total
    print("Total words (that we care about): " + str(len(frequencies.keys())))
    return frequencies
def select_DW_columns(labels):
    wl = set()
    for each in brown.words():
        each = each.lower()
        if each.isalpha() and (each not in wl):
            wl.add(each)
    DW_labels = []
    DW_idx = []
    toktok = ToktokTokenizer()
    for idx, label in enumerate(labels):
        tokens = toktok.tokenize(label)
        flag = True
        for token in tokens:
            if token.isdigit():
                continue
            elif token.lower() not in wl:
                flag = False
                break
        if flag:
            DW_labels.append(label)
            DW_idx.append(idx)
    return DW_idx, DW_labels
class Dictionary(object):
    '''
    TODO: a lot of error cases to handle:
    1. no such file (due to download errors)
    2. the file with data.csv exists, but it is not a csv file -> col errors
    '''

    def __init__(self):
        self.wl = set()
        for each in brown.words():
            each = each.lower()
            if each.isalpha() and (each not in self.wl):
                self.wl.add(each)
        self.toktok = ToktokTokenizer()

    def isDW(self, label):
        tokens = self.toktok.tokenize(label)
        flag = True
        for token in tokens:
            if token.lower() not in self.wl:
                flag = False
                break
        return flag
def preprocess_advanced(data):
    X, Y = [], []
    toktok = ToktokTokenizer()
    en_stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    for index, review in data.iterrows():
        if (index + 1) % 100000 == 0:
            print(index + 1)
        # words = nltk.word_tokenize(review['text'])
        tokens = toktok.tokenize(review['text'].lower())
        # tokens = word_tokenize(doc.lower())
        stopped_tokens = filter(lambda token: token not in en_stop, tokens)
        stemmed_tokens = map(lambda token: p_stemmer.stem(token), stopped_tokens)
        # if not return_tokens:
        #     return ' '.join(stemmed_tokens)
        # return list(stemmed_tokens)
        X.append(list(stemmed_tokens))
        # X.append(nltk.word_tokenize(review['text']))
        Y.append(int(review['stars'] - 1))
        if len(Y) == 1000:
            break
    df_new = pd.DataFrame({'text': X, 'stars': Y})
    return df_new
class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def get_num(self, text):
        lemmas = [
            self.morph.parse(word)[0].normal_form
            for word in self.toktok.tokenize(text)
        ]
        if 'указывать' in lemmas and 'предложение' in lemmas:
            w = lemmas[lemmas.index('указывать') + 1]  # first
            d = {'один': 1, 'два': 2, 'три': 3, 'четыре': 4, 'предложение': 1}
            if w in d:
                return d[w]
        elif 'указывать' in lemmas and 'вариант' in lemmas:
            return 'unknown'
        return 1

    def compare_text_with_variants(self, text, variants, num=1):
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        i, predictions = 0, {}
        for j in variant_vectors:
            sim = cosine_similarity(text_vector[0].reshape(1, -1),
                                    j.reshape(1, -1)).flatten()[0]
            predictions[i] = sim * (len(variants[i]) ** (1 / 5))
            i += 1
        # indexes = sorted(predictions.items(), key=operator.itemgetter(1), reverse=True)[:num]
        # return [str(i[0] + 1) for i in indexes]
        return predictions

    def sent_split(self, text):
        reg = r'\(*\d+\)'
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r'\(*1\)', task['text'])[:2]
        variants = [
            t['text'].replace("—", "").replace("<...>", "").replace("<…>", "")
            .replace(",", "").replace(".", "").replace(":", "")
            .replace("»", "").replace("«", "").replace("-", " ")
            for t in task['question']['choices']
        ]
        text, task = "", ""
        if 'Укажите' in task_text:
            text, task = re.split('Укажите ', task_text)
            task = 'Укажите ' + task
        elif 'Укажите' in first_phrase:
            text, task = task_text.replace("—", "").replace("<...>", "") \
                .replace("<…>", "").replace(",", "").replace(".", "") \
                .replace(":", "").replace("»", "").replace("«", "") \
                .replace("-", " "), first_phrase
        return text, task, variants

    def fit(self, tasks):
        pass

    def load(self, path=""):
        pass

    def save(self, path=''):
        pass

    def predict_from_model(self, task, num=2):
        text, task, variants = self.process_task(task)
        # collapse repeated spaces left over after removing the "(n)" markers
        text = re.sub('[0-9]*\)', '', text).replace('  ', ' ').replace('  ', ' ')
        for i, _ in enumerate(variants):
            variants[i] = re.sub('[0-9]*\)', '', variants[i])
            variants[i] = variants[i].replace('  ', ' ').replace('  ', ' ')
        result = self.compare_text_with_variants(text, variants, num=num)
        text = [text]
        text.extend(variants)
        result2 = self.compare_text_with_variants2(text)
        indexes1 = sorted(result.items(), key=operator.itemgetter(1),
                          reverse=True)[:num]
        indexes2 = sorted(result2.items(), key=operator.itemgetter(1),
                          reverse=True)[-num:]
        symm1, symm2 = 0, 0
        for i in range(len(result)):
            symm1 += result[i]
            symm2 += result2[i]
        dif = symm2 / symm1
        for i in range(len(result)):
            result[i] -= result2[i] / (dif * 4)
        indexes = sorted(result.items(), key=operator.itemgetter(1),
                         reverse=True)[:num]
        ans = [str(i[0] + 1) for i in indexes]
        return sorted(ans)
def create_lol_attack_ontology(lolbin_data, attack_windows):
    '''
    Takes in lolbin and attack lists and returns the resulting merged ontology
    :param lolbin_data: list of dictionaries of parsed lolbins
    :param attack_windows: list of dictionaries of parsed ATT&CK (Windows) techniques
    :return: merged ontology
    '''
    functions_to_attack = {'ADS': 'T1096',
                           'Compile': 'T1127',
                           'Create Service': 'T1050',
                           'Start Service': 'T1035',
                           'NTDS.dit': 'T1003',
                           'UACBypass': 'T1088',
                           'Download': 'T1105'}
    toktok = ToktokTokenizer()
    ontology = {}
    for i in range(len(lolbin_data)):
        name = lolbin_data[i].get('name')
        functions = lolbin_data[i].get('functions', [])
        examples = lolbin_data[i].get('examples', [])
        lol_link = lolbin_data[i].get('link', None)
        if lol_link is None:
            lol_link = []
        short_name = name.split('.')[0]
        # clean up cases where the list of examples has comments and unrelated lines
        examples = [example for example in examples
                    if short_name.lower() in example.lower()]
        found = False
        attack_tid_strong = set()
        attack_tid_weak = set()
        for attack in attack_windows:
            attack_name = attack.get('name')
            description = attack.get('description').lower()
            description_tokenized = toktok.tokenize(description)
            if name in description_tokenized:
                attack_tid_strong.add(attack.get('tid'))
            if short_name in description_tokenized:
                attack_tid_weak.add(attack.get('tid'))
        for function in functions:
            for k, v in functions_to_attack.items():
                if k in function:
                    attack_tid_strong.add(v)
        ontology[name.lower()] = {'functions': functions,
                                  'examples': examples,
                                  'attack_ids_strong': attack_tid_strong,
                                  'attack_ids_weak': attack_tid_weak,
                                  'short_name': short_name,
                                  'references': lol_link}
    # One more pass. If all the examples for an executable, library, or script
    # involve being invoked by a different executable, we will change the mapping
    ontology_tools = ontology.keys()
    for name, data in ontology.items():
        examples = data.get('examples')
        # Get deduped list of the initial executable or tool name used in each example
        tool_in_example = set()
        for example in examples:
            tokens = example.split()
            if len(tokens) > 0:
                tool_in_example.add(tokens[0].strip().lower())
        # Check if the tools listed are actually directly a MITRE ATT&CK
        # technique. If so, directly map to it.
        if len(tool_in_example) == 1:
            # the tool in the example is the only one given and it is
            # different from the primary lolbas name
            tool_name = tool_in_example.pop().strip().lower().split('.')[0]
            for attack in attack_windows:
                attack_short_name = attack['name'].split('.')[0].lower()
                tid = attack['tid']
                if tool_name == attack_short_name:
                    ontology[name]['attack_ids_strong'].add(tid)
    # Exceptions:
    clear_weak = ['regsvr32.exe', 'powershell.exe', 'control.exe', 'expand.exe',
                  'winword.exe', 'explorer.exe', 'replace.exe', 'bash.exe']
    clear_strong = ['winword.exe', 'explorer.exe', 'replace.exe', 'bash.exe']
    for weak_to_clear in clear_weak:
        ontology[weak_to_clear]['attack_ids_weak'] = set()
    for strong_to_clear in clear_strong:
        ontology[strong_to_clear]['attack_ids_strong'] = set()
    ontology['powershell.exe']['attack_ids_strong'] = set(['T1086'])
    # Remove misclassifications
    try:
        ontology['sc.exe']['attack_ids_weak'].remove('T1197')
    except:
        pass
    try:
        ontology['url.dll']['attack_ids_weak'].remove('T1192')
    except:
        pass
    try:
        ontology['sc.exe']['attack_ids_strong'].remove('T1013')
    except:
        pass
    add_scripting = ['testxlst.js', 'scriptrunner.exe', 'runscripthelper.exe',
                     'msdeploy.exe', 'manage-bde.wsf', 'te.exe', 'cscript.exe']
    for add_script in add_scripting:
        ontology[add_script]['attack_ids_strong'].add('T1064')
    ontology['ieexec.exe']['attack_ids_strong'].add('T1105')
    ontology['msiexec.exe']['attack_ids_strong'].add('T1105')
    ontology['ieexec.exe']['functions'] = list(
        set(ontology['ieexec.exe']['functions']).union(['Download']))
    ontology['msiexec.exe']['functions'] = list(
        set(ontology['msiexec.exe']['functions']).union(['Download']))
    ### Add T1202 indirect execution
    indirect_execution = ['explorer.exe', 'dnscmd.exe', 'winword.exe',
                          'extexport.exe', 'vsjitdebugger.exe', 'csi.exe',
                          'hh.exe', 'appvlp.exe', 'scriptrunner.exe', 'dxcap.exe',
                          'ieexec.exe', 'openwith.exe', 'pcwrun.exe', 'msiexec.exe',
                          'bash.exe', 'msdeploy.exe', 'mftrace.exe']
    for indirect_exec in indirect_execution:
        ontology[indirect_exec]['attack_ids_strong'].add('T1202')
    # Combine all the strong and weak technique IDs
    for name, data in ontology.items():
        data['attack_ids'] = list(
            data['attack_ids_strong'].union(data['attack_ids_weak']))
        data.pop('attack_ids_strong')
        data.pop('attack_ids_weak')
    return ontology
def __init__(self):
    data_path = config.data_path
    ratio = config.freq_ratio
    start_vocabs = config.start_vocabs
    self.buckets = config.buckets
    print("Reading 'tasks.csv' file...")
    with open(data_path, 'r', encoding="utf-8") as f:
        reader = csv.reader(f, skipinitialspace=True)
        next(reader)
        sentences = [x[0].lower() for x in reader]
    self.sentences = sentences[:train_size]
    print("{} sentences loaded.".format(len(self.sentences)))
    # tokenize sentences
    tok = ToktokTokenizer()
    self.tokenized_sens = [tok.tokenize(sen) for sen in self.sentences]
    # clean sentences and only consider sentences with length > 1
    self.tokenized_sens = [[x for x in sen if x.isalpha()]
                           for sen in self.tokenized_sens]
    self.tokenized_sens = [
        sen for sen in self.tokenized_sens if sen != [] and len(sen) > 1
    ]
    # remove low frequency words and index them
    frequency_words = nltk.FreqDist(itertools.chain(*self.tokenized_sens))
    size = len(list(set(itertools.chain(*(self.tokenized_sens)))))
    self.vocabs = start_vocabs + [
        w[0] for w in frequency_words.most_common(int(size * ratio))
    ]
    self.vocab_size = len(self.vocabs)
    self.word_to_index = dict([(w, i) for i, w in enumerate(self.vocabs)])
    self.tokenized_sens = [[w if w in self.vocabs else '_unk' for w in sen]
                           for sen in self.tokenized_sens]
    # create train data
    self.x_train = [[self.word_to_index[w] for w in sen[:-1]]
                    for sen in self.tokenized_sens]
    self.y_train = [[self.word_to_index[w] for w in sen[1:]]
                    for sen in self.tokenized_sens]
test['brand'] = le.transform(test.brand_name)
del le, train['brand_name'], test['brand_name']

# Replace the category slash
test["category_name_split"] = test["category_name"].str.replace(' ', '_')
train["category_name_split"] = train["category_name"].str.replace(' ', '_')
test["category_name_split"] = test["category_name_split"].str.replace('/', ' ')
train["category_name_split"] = train["category_name_split"].str.replace('/', ' ')
train.head()
print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() - start_time))

toktok = ToktokTokenizer()
train['name_token'] = [
    " ".join(toktok.tokenize(sent))
    for sent in train['name'].str.lower().tolist()
]
test['name_token'] = [
    " ".join(toktok.tokenize(sent))
    for sent in test['name'].str.lower().tolist()
]
print('[{}] Finished Tokenizing text...'.format(time.time() - start_time))

# PROCESS TEXT: RAW
print("Text to seq process...")
print(" Fitting tokenizer...")
import re
rgx = re.compile('[%s]' % '!"#%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def get_num(self, text):
        lemmas = [self.morph.parse(word)[0].normal_form
                  for word in self.toktok.tokenize(text)]
        if 'указывать' in lemmas and 'предложение' in lemmas:
            w = lemmas[lemmas.index('указывать') + 1]  # first
            d = {'один': 1, 'два': 2, 'три': 3, 'четыре': 4, 'предложение': 1}
            if w in d:
                return d[w]
        elif 'указывать' in lemmas and 'вариант' in lemmas:
            return 'unknown'
        return 1

    def compare_text_with_variants(self, text, variants, num=1):
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        i, predictions = 0, {}
        for j in variant_vectors:
            sim = cosine_similarity(text_vector[0].reshape(1, -1),
                                    j.reshape(1, -1)).flatten()[0]
            predictions[i] = sim
            i += 1
        indexes = sorted(predictions.items(), key=operator.itemgetter(1),
                         reverse=True)[:num]
        return sorted([str(i[0] + 1) for i in indexes])

    def sent_split(self, text):
        reg = r'\(*\d+\)'
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r'\(*1\)', task['text'])[:2]
        variants = [t['text'] for t in task['question']['choices']]
        text, task = "", ""
        if 'Укажите' in task_text:
            text, task = re.split('Укажите ', task_text)
            task = 'Укажите ' + task
        elif 'Укажите' in first_phrase:
            text, task = task_text, first_phrase
        return text, task, variants

    def fit(self, tasks):
        pass

    def load(self, path=""):
        pass

    def save(self, path=''):
        pass

    def predict_from_model(self, task, num=2):
        text, task, variants = self.process_task(task)
        result = self.compare_text_with_variants(text, variants, num=num)
        return result
class Pipeline:
    def __init__(self, stopwords: Set[str]) -> None:
        self.stopwords = stopwords
        self.ps = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english")
        self.tokenizer = ToktokTokenizer()
        self.puncuation = set(string.punctuation)
        # self.words = set(nltk.corpus.words.words())
        self.pipeline = [
            self.remove_punctuation,
            self.tokenize,
            self.lowering,
            self.remove_words,
            self.remove_stopwords,
            self.remove_digits_and_punctuation,
            self.remove_dangling_puncuation,
            self.remove_single,
            self.stemm,
            self.remove_starting_with_file,
        ]
        self.words_to_remove = set(
            "edit wookieepedia format registerr wrapup wiki sandbox click "
            "edit page link code preview button format".split(" "))

    def remove_starting_with_file(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if not word.startswith("file"):
                yield word

    def remove_words(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if word not in self.words_to_remove:
                yield word

    def cleanup_space(self, a_string: str) -> str:
        return a_string.replace("|", " ").replace("\n", " ")

    def remove_dangling_puncuation(self, document_iterable: Iterable[str]) -> Iterable[str]:
        return [w.strip(string.punctuation) for w in document_iterable]

    def tokenize(self, document: str) -> Iterable[str]:
        for word in self.tokenizer.tokenize(document):
            yield word

    def remove_digits_and_punctuation(self, document_iterable: Iterable[str]) -> Iterable[str]:
        return [
            w for w in document_iterable
            if not all(x.isdigit() or x in self.puncuation for x in w)
        ]

    def remove_punctuation(self, document: str) -> str:
        return document.translate(str.maketrans("", "", string.punctuation))

    def remove_stopwords(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if word_not_in_set(word, self.stopwords):
                yield word

    def remove_everything_with_digit(self, document_iterable: Iterable[str]) -> Iterable[str]:
        return [w for w in document_iterable if not any(x.isdigit() for x in w)]

    def lowering(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            yield word.lower()

    def lemmatize(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            yield self.ps.lemmatize(word)

    def stemm(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            yield self.stemmer.stem(word)

    def remove_single(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if len(word) > 1:
                yield word

    def pipe(self, document: str) -> Iterable[str]:
        ob = document
        for task in self.pipeline:
            ob = task(ob)
        return ob
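# Hedged usage sketch (not from the original source): the stopword set and the
# document string are illustrative; `word_not_in_set` is an external helper in
# the original project.
from nltk.corpus import stopwords as nltk_stopwords

pipeline = Pipeline(set(nltk_stopwords.words("english")))
tokens = list(pipeline.pipe("Click the Edit button to preview the Wiki page!"))
# Each callable in self.pipeline is applied in order; the result is a list of
# lowercased, stemmed tokens with stopwords and pipeline-specific noise removed.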
# TODO count the number of words per summary / summary stats:
toktok = ToktokTokenizer()
num_sents_list = []
num_words_list = []
word_counter = Counter()
word_counter_lower = Counter()
for count, file in enumerate(os.listdir("/data/corpora/newser/data-final")):
    print(count)
    file_data = open(
        f"/data/corpora/newser/data-final/{file}/{file}.reference.txt",
        "r").read()
    summary_sents = sent_tokenize(file_data)
    num_sents_list.append(len(summary_sents))
    num_words = 0
    for sent in summary_sents:
        cur_words = toktok.tokenize(sent)
        word_counter.update(cur_words)
        word_counter_lower.update([word.lower() for word in cur_words])
        num_words += len(cur_words)
    num_words_list.append(num_words)
num_sents_np = np.array(num_sents_list)
num_words_np = np.array(num_words_list)
print(f"the average number of sentences per summary: {np.mean(num_sents_np)}\n")
print(f"the std of sentences per summary: {np.std(num_sents_np)}\n")
print(f"the average number of words per summary: {np.mean(num_words_np)}\n")
print(f"the std of words per summary: {np.std(num_words_np)}\n")
print(len(word_counter))
print(len(word_counter_lower))
le.fit(np.hstack([train.brand_name, test.brand_name]))
train['brand'] = le.transform(train.brand_name)
test['brand'] = le.transform(test.brand_name)
del le, train['brand_name'], test['brand_name']

# Replace the category slash
test["category_name_split"] = test["category_name"].str.replace(' ', '_')
train["category_name_split"] = train["category_name"].str.replace(' ', '_')
test["category_name_split"] = test["category_name_split"].str.replace('/', ' ')
train["category_name_split"] = train["category_name_split"].str.replace('/', ' ')
train.head()
print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() - start_time))

toktok = ToktokTokenizer()
train['name_token'] = [" ".join(toktok.tokenize(sent))
                       for sent in train['name'].str.lower().tolist()]
test['name_token'] = [" ".join(toktok.tokenize(sent))
                      for sent in test['name'].str.lower().tolist()]
# train['item_description_token'] = [" ".join(toktok.tokenize(sent[:400])) for sent in train['item_description'].str.lower().tolist()]
# test['item_description_token'] = [" ".join(toktok.tokenize(sent[:400])) for sent in test['item_description'].str.lower().tolist()]
print('[{}] Finished Tokenizing text...'.format(time.time() - start_time))

# PROCESS TEXT: RAW
print("Text to seq process...")
print(" Fitting tokenizer...")
import re
rgx = re.compile('[%s]' % '!"#%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

@jit
class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def clean_text(self, text):
        newtext, logic = [], ["PREP", "CONJ", "Apro", "PRCL", "INFN", "VERB", "ADVB"]
        for token in self.toktok.tokenize(text):
            if any(tag in self.morph.parse(token)[0].tag for tag in logic):
                newtext.append(self.morph.parse(token)[0].normal_form)
        return ' '.join(newtext)

    def get_pos(self, text):
        pos, lemmas = 'word', [self.morph.parse(word)[0].normal_form
                               for word in self.toktok.tokenize(text)]
        if 'сочинительный' in lemmas:
            pos = "CCONJ"
        elif 'подчинительный' in lemmas:
            pos = "SCONJ"
        elif 'наречие' in lemmas:
            pos = "ADV"
        elif 'союзный' in lemmas:
            pos = "ADVPRO"
        elif 'местоимение' in lemmas:
            pos = "PRO"
        elif 'частица' in lemmas:
            pos = "PART"
        return pos

    def get_num(self, text):
        lemmas = [self.morph.parse(word)[0].normal_form
                  for word in self.toktok.tokenize(text)]
        if 'слово' in lemmas and 'предложение' in lemmas:
            d = {'один': 1, 'два': 2, 'три': 3, 'четыре': 4,
                 'первый': 1, 'второй': 2, 'третий': 3, 'четвертый': 4}
            for i in lemmas:
                if i in d:
                    return d[i]
        return 1

    def sent_split(self, text):
        reg = r'\(\n*\d+\n*\)'
        return re.split(reg, text)

    def compare_text_with_variants(self, word, text, variants):
        sents = self.sent_split(text)
        for sent in sents:
            lemmas = [self.morph.parse(word)[0].normal_form
                      for word in self.toktok.tokenize(text)]
            if word.lower() in lemmas:
                text = sent
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        i, predictions = 0, {}
        for j in variant_vectors:
            sim = cosine_similarity(text_vector[0].reshape(1, -1),
                                    j.reshape(1, -1)).flatten()[0]
            predictions[i] = sim
            i += 1
        indexes = sorted(predictions.items(), key=operator.itemgetter(1),
                         reverse=True)[:1]
        return sorted([str(i[0] + 1) for i in indexes])

    def process_task(self, task):
        try:
            first_phrase, task_text = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            first_phrase, task_text = ' '.join(re.split(r'\(\n*1\n*\)', task['text'])[:-1]), \
                                      re.split(r'\(\n*1\n*\)', task['text'])[-1]
        variants = [t['text'] for t in task['question']['choices']]
        text, task, word = "", "", ""
        if 'Определите' in task_text:
            text, task = re.split('Определите', task_text)
            task = 'Определите ' + task
            word = re.split('\.', re.split('значения слова ', text)[1])[0]
        elif 'Определите' in first_phrase:
            text, task = task_text, first_phrase
            word = re.split('\.', re.split('значения слова ', task)[1])[0]
        return text, task, variants, word

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver3.pkl"):
        pass

    def save(self, path='data/models/solver3.pkl'):
        pass

    def predict_from_model(self, task):
        text, task, variants, word = self.process_task(task)
        result = self.compare_text_with_variants(word, text, variants)
        return result
    return (text)  # closes the PreProcessing() helper defined above this excerpt


totalText = ''
for x in df['Body']:
    ps = PreProcessing(x)
    totalText = totalText + " " + ps

from wordcloud import WordCloud
import matplotlib.pyplot as plt

wc = WordCloud(max_font_size=60).generate(totalText)
plt.figure(figsize=(16, 12))
plt.imshow(wc, interpolation="bilinear")

import nltk
# `token` is a ToktokTokenizer instance created earlier in the notebook
freqdist = nltk.FreqDist(token.tokenize(totalText))
freqdist

plt.figure(figsize=(16, 5))
freqdist.plot(20)

totalText = ''
for x in df['Title']:
    ps = PreProcessing(x)
    totalText = totalText + " " + ps

from wordcloud import WordCloud
import matplotlib.pyplot as plt

wc = WordCloud(max_font_size=60).generate(totalText)
plt.figure(figsize=(16, 12))
plt.imshow(wc, interpolation="bilinear")
class Solver(object):
    def __init__(self, seed=42):
        self.morph = morph
        self.mystem = Mystem()
        self.tokenizer = ToktokTokenizer()
        self.w2v = Word2vecProcessor()
        self.seed = seed
        self.init_seed()
        self.synonyms = None
        self.antonyms = None
        self.phraseology = None
        self.phraseologisms = None
        self.prep_synon = None
        self.set_f = None
        self.verbs_dict = None
        self.chasti_rechi = None
        self.set_f_2 = None
        self.is_loaded = False

    def init_seed(self):
        random.seed(self.seed)

    def lemmatize(self, text):
        return [
            self.morph.parse(word)[0].normal_form
            for word in self.tokenizer.tokenize(text.strip())
        ]

    def get_word(self, text):
        try:
            return re.split("»", re.split("«", text)[1])[0]
        except IndexError:
            return ""

    def get_pos(self, text):
        lemmas = [l for l in self.lemmatize(text) if l != " "]
        if "фразеологизм" in lemmas:
            pos = "PHR"
        elif "синоним" in lemmas:
            pos = "SYN"
        elif "антоним" in lemmas:
            pos = "ANT"
        elif "антонимический" in lemmas:
            pos = "ANT"
        elif "синонимический" in lemmas:
            pos = "SYN"
        else:
            pos = "DEF"
        return pos

    def full_intersection(self, small_lst, big_lst):
        if sum([value in big_lst for value in small_lst]) == len(small_lst):
            return True
        return False

    def sent_split(self, text):
        reg = r"\(*\n*\d+\n*\)"
        return re.split(reg, text)

    def search(self, text_lemmas, lst):
        for l in lst:
            if self.full_intersection(l, text_lemmas):
                return "".join(l)
        return ""

    def fit(self, tasks):
        pass

    @singleton
    def load(self, path="data/models/solvers/solver24"):
        self.synonyms = open(
            os.path.join(path, r"synonyms.txt"), "r", encoding="utf8"
        ).readlines()
        self.synonyms = [
            re.sub("\.", "", t.lower().strip("\n")).split(" ") for t in self.synonyms
        ]
        self.synonyms = [[t for t in l if t] for l in self.synonyms]
        self.antonyms = open(
            os.path.join(path, r"antonyms.txt"), "r", encoding="utf8"
        ).readlines()
        self.antonyms = [t.strip(" \n").split(" - ") for t in self.antonyms]
        self.phraseology = open(
            os.path.join(path, r"phraseologisms.txt"), "r", encoding="utf8",
        ).readlines()
        self.phraseology = [
            [
                l
                for l in self.lemmatize(l)
                if l not in ["\n", " ", "...", "", ",", "-", ".", "?", r" (", r"/"]
            ]
            for l in self.phraseology
        ]
        self.phraseologisms = load_pickle(os.path.join(path, "phraseologisms.pckl"))
        self.prep_synon = pd.read_csv(os.path.join(path, "prep_synonyms.csv"))
        self.sber_phraseologs = pd.read_csv(
            os.path.join(path, "prep_phraseologisms.csv")
        )
        self.set_f, self.verbs_dict, self.chasti_rechi, self.set_f_2 = load_pickle(
            os.path.join(path, "solver24.pkl")
        )
        self.is_loaded = True

    def save(self, path="data/models/solvers/solver24"):
        pass

    @staticmethod
    def parse_task(task):
        regex = "(\([0-9]{1,2}\)|\s[0-9]{1,2}\)|[.!?-][0-9]{1,2}\))"
        p1 = "из предлож[а-яё]+\s+\(?[0-9]{1,2}\)?\s*[–—−-]\s*\(?[0-9]{1,2}\)?"
        p2 = "из предлож[а-яё]+\s+\(?[0-9]{1,2}\)?\s*"
        task = task["text"].lower()
        selector = None
        if re.findall(p1, task):
            q = re.findall(p1, task)[0]
            q = q.replace("(", "")
            q = q.replace(")", "")
            task = re.sub(p1, q, task)
            numbers = re.findall("[0-9]{1,2}", q)
            selector = list(range(int(numbers[0]), int(numbers[1]) + 1))
        elif re.findall(p2, task):
            q = re.findall(p2, task)[0]
            q = q.replace("(", "")
            q = q.replace(")", "")
            q = "." + q
            task = re.sub(p2, q, task)
            numbers = re.findall("[0-9]{1,2}", q)
            selector = [int(numbers[0])]
        l = re.split("[.!?…]", task)
        l = [re.split(regex, x) for x in l]
        l = sum(l, [])
        l = [x.strip() for x in l]
        l = [x for x in l if len(x) > 0]
        text = []
        i = 0
        while i < len(l):
            line = [l[i]]
            i += 1
            while (
                re.match(regex, line[0])
                and (i < len(l))
                and not (re.match(regex, l[i]))
            ):
                line += [l[i]]
                i += 1
            text.append(line)
        question = [x[0] for x in text if not re.match(regex, x[0])]
        if len(text[-1]) > 2:
            question += text[-1][2:]
            text[-1] = text[-1][:2]
        question = " ".join(question)
        text = [(x[0], " ".join(x[1:])) for x in text]
        text = [x for x in text if re.match(regex, x[0])]
        text_df = pd.DataFrame(text)
        text_df[0] = text_df[0].map(lambda x: int(x.replace("(", "").replace(")", "")))
        if selector:
            tmp = text_df[text_df[0].isin(selector)]
            if tmp.shape[0] > 0:
                text_df = tmp
            else:
                print(">>>>> SELECTOR ERROR")
        return question, list(text_df[1])

    def lemm_and_clear(self, text, morph):
        analyze = morph.analyze(text)
        lemm_text = [
            (x["analysis"][0]["lex"] if x.get("analysis") else x["text"])
            for x in analyze
        ]
        lemm_text = [
            self.verbs_dict[x] if x in self.verbs_dict else x for x in lemm_text
        ]
        analyze = list(zip(lemm_text, [x["text"] for x in analyze]))
        lemm_text = [x for x in lemm_text if not re.match("\s+", x)]
        lemm_text = [x for x in lemm_text if re.match("\w+", x)]
        return lemm_text, analyze

    @staticmethod
    def find_subarray(arr1, anal_arr2):
        arr2 = [x[0] for x in anal_arr2]
        sourse_arr2 = [x[1] for x in anal_arr2]
        for i_arr2 in range(len(arr2) - 1, -1, -1):
            positions = []
            last_positions = 0
            for j_arr1, word1 in enumerate(arr1):
                for j_arr2, word2 in enumerate(arr2[i_arr2:]):
                    if (word1 == word2) and (last_positions <= j_arr2):
                        last_positions = j_arr2
                        positions.append(j_arr2)
                        break
            if len(arr1) == len(positions):
                return sourse_arr2[i_arr2:][positions[0]:positions[-1] + 1]

    def suggest_prediction(self, task):
        question_task, text_task = self.parse_task(task)
        question_task_re = re.sub("[^а-яё]", "", question_task.lower())
        if "фразеологизм" in question_task_re:
            lemm_text_task = [self.lemm_and_clear(x, self.mystem) for x in text_task]
            for num_source in range(0, self.phraseologisms[1].max() + 1):
                for seq, annotated_seq in lemm_text_task:
                    for i in range(0, len(seq)):
                        for j in range(1, self.phraseologisms[2].map(len).max() + 1):
                            if (i + j) <= len(seq):
                                if any(
                                    [
                                        set(seq[i:i + j]) == set_f
                                        for set_f in self.phraseologisms[
                                            self.phraseologisms[1] == num_source
                                        ][3]
                                    ]
                                ):
                                    find_elements = seq[i:i + j]
                                    return (
                                        "".join(
                                            self.find_subarray(
                                                find_elements, annotated_seq
                                            )
                                        )
                                        .lower()
                                        .replace(" ", "")
                                    )
        elif "синоним" in question_task_re:
            if type(text_task) == list:
                text_task = " ".join(text_task)
            norm_text_task = self.lemm_and_clear(text_task, self.mystem)
            if "синонимкслов" in question_task_re:
                word = re.findall(r"(?<=к слову).*", question_task)[0]
                words = re.findall("\w+", word)
                words = [x.lower() for x in words]
                set_seq = set(norm_text_task[0])
                select_syn = self.prep_synon[
                    self.prep_synon["MAIN"].isin(words)
                    & self.prep_synon["Синоним"].isin(set_seq)
                ]
                select_syn = select_syn[select_syn["MAIN"] != select_syn["Синоним"]]
                select_syn = select_syn.sort_values("number")
                synon_result = select_syn[["MAIN", "Синоним"]].to_dict("split")["data"]
                if synon_result:
                    tmp = [x for x in synon_result if x[0] == words[0]]
                    if tmp:
                        synon_result = tmp[0]
                    else:
                        synon_result = synon_result[0]
                    for norm_w, real_w in norm_text_task[1]:
                        if norm_w == synon_result[1]:
                            return real_w.lower()
            elif re.match(".*синонимич.*пар.*", question_task_re) or (
                "синонимы" in question_task_re
            ):
                result = []
                set_seq = set(norm_text_task[0])
                try:
                    select_syn = self.prep_synon[
                        self.prep_synon["prep_MAIN"].isin(set_seq)
                        & self.prep_synon["prep_Синоним"].isin(set_seq)
                    ]
                    select_syn = select_syn[
                        select_syn["prep_MAIN"] != select_syn["prep_Синоним"]
                    ]
                    select_syn = select_syn.sort_values("number")
                    synon_result = set(
                        select_syn[["prep_MAIN", "prep_Синоним"]].to_dict("split")[
                            "data"
                        ][0]
                    )
                    for norm_w, real_w in norm_text_task[1]:
                        if norm_w in synon_result:
                            result.append(real_w)
                        if len(synon_result) == len(result):
                            break
                    return "".join(result).lower()
                except:
                    pass
                result = []
                set_seq = set(norm_text_task[0])
                list_seq = list(set_seq)
                list_seq_w2v = [self.w2v.word_vector(i) for i in list_seq]
                list_seq = [x[0] for x in zip(list_seq, list_seq_w2v) if x[1] is not None]
                list_seq_w2v = [x for x in list_seq_w2v if x is not None]
                tmp = cosine_distances(np.stack(list_seq_w2v))
                for i in range(tmp.shape[0]):
                    tmp[i, i] += 1000
                n1, n2 = np.unravel_index(tmp.argmin(), tmp.shape)
                synon_result = set((list_seq[n1], list_seq[n2]))
                for norm_w, real_w in norm_text_task[1]:
                    if norm_w in synon_result:
                        result.append(real_w)
                    if len(synon_result) == len(result):
                        break
                return "".join(result).lower()

    def predict_from_model(self, task):
        prediction = self.suggest_prediction(task)
        if not prediction:
            task_description, sentences = self.parse_task(task)
            prediction = "".join(
                random.choices(
                    [
                        w.strip(punctuation)
                        for w in self.tokenizer.tokenize(random.choice(sentences))
                        if w not in punctuation and not w.isdigit()
                    ],
                    k=2,
                )
            )
        return prediction
class Preprocessor(PreprocessorConfig):
    """A Preprocessor object inherits from a PreprocessorConfig object to
    initialize its parameters. Then, it does 5 things:

    1. Detects and replaces numbers/floats by the generic tokens 'FLOAT', 'INT'
    2. Adds spaces around punctuation so that tokenization avoids adding
       'word.' to the vocabulary instead of 'word', '.'.
    3. Lowers words
    4. Recursive word-phrase detection: with a simple probabilistic rule,
       gathers the tokens 'new', 'york' into a single token 'new_york'.
    5. Frequency subsampling: discards infrequent words with a probability
       depending on their frequency.

    It works with 2 main methods, '.fit' and '.transform'.

    The first method fits the vocabulary (which implies lowering, tokenizing,
    doing the word-phrase detection and frequency subsampling). Fitting the
    vocabulary implies calculating word frequencies over the whole corpus,
    which can be a challenge when parallelizing the code.

    The 'transform' method then uses the learned vocabulary to re-write clean
    files in the 'writing_dir' directory. This method is also parallelized
    over all the cpus available.

    Usage example:
    ```python
    prep = Preprocessor('/tmp/logdir')  # We suppose we already have a
                                        # PreprocessorConfig saved in /tmp/logdir
    prep.fit('~/mydata/')
    prep.filter()
    prep.transform('~/mydata')
    ```
    """

    def __init__(self, log_dir, from_log=False):
        self.log_dir = log_dir
        if checkExistenceFile(os.path.join(log_dir, "PreprocessorConfig.json")):
            self.read_config()
        self.tok = ToktokTokenizer()
        self.parsing_char_ = sha1(b"sally14").hexdigest()
        self.fitted = False
        if from_log:
            self.fitted = True
            with open(os.path.join(self.log_dir, "vocabulary.json"),
                      "r", encoding="utf-8") as f:
                self.vocabulary_ = json.load(f)
            with open(os.path.join(self.log_dir, "WordPhrases.json"),
                      "r", encoding="utf-8") as f:
                p = json.load(f)
                self.phrasewords_ = {
                    i.replace("_", self.parsing_char_): p[i] for i in p.keys()
                }

    def get_batches(self, filenames):
        """Defines the filename batches to multiprocess fitting and transformation

        Args:
            filenames : str or list of str
                a list of files or a directory containing the files to
                fit/transform the data on.
        Returns:
            batches : list of list of str
                the list of batches (lists of filenames)
        """
        if type(filenames) == str:
            if os.path.isdir(filenames):
                ls = glob(os.path.join(filenames, "*"))
        elif type(filenames) == list:
            ls = filenames
        else:
            logger.error("Bad type for filenames, must be str or list of str")
        batches = []
        cpu = cpu_count()
        n = len(ls)
        if n >= cpu:
            for i in range(cpu - 1):
                batches.append(ls[(n // cpu) * i:(n // cpu) * (i + 1)])
            batches.append(ls[(n // cpu) * (cpu - 1):])
        else:
            batches = list(map(lambda x: [x], ls))
        assert len(batches) == min(cpu, n)
        return batches

    def fit_batch(self, filebatch):
        """Fits one batch

        Args:
            filebatch : list of str
                the list of file names in the given batch
        Returns:
            unig : dict
                fitted unigram dictionary
            big : dict
                fitted bigram dictionary
        """
        unig = {}
        big = {}
        for file in filebatch:
            text = openFile(file)
            cleaned_text = self.clean(text)
            unig = melt_vocab_dic(get_unigram_voc(cleaned_text), unig)
            big = melt_vocab_dic(
                get_bigram_voc(cleaned_text, self.parsing_char_), big)
            del text
            del cleaned_text
        return [unig, big]

    def fit(self, filenames):
        """Parallelizes the fitting & definition of the vocabulary,
        dumped in self.log_dir

        Args:
            filenames : str or list of str
                the list of file names in the given batch
        """
        logger.info("Started fitting")
        batches = self.get_batches(filenames)
        logger.info("Defined {} batches for multiprocessing".format(cpu_count()))
        logger.info("Starting parallelized fitting")
        pool = Pool(processes=cpu_count())
        results = pool.map(self.fit_batch, batches)
        pool.close()
        pool.terminate()
        pool.join()
        logger.info("Received {} batches results".format(len(results)))
        logger.info("Melting unigram and bigram dictionaries")
        self.unigram_dic_ = {}
        self.bigram_dic_ = {}
        for j in range(len(results)):
            self.unigram_dic_ = melt_vocab_dic(self.unigram_dic_, results[j][0])
            self.bigram_dic_ = melt_vocab_dic(self.bigram_dic_, results[j][1])
            results[j] = 0  # Clears memory
        del results
        gc.collect()
        with open(os.path.join(self.log_dir, "unigrams.json"),
                  "w", encoding="utf-8") as f:
            json.dump(self.unigram_dic_, f)
        with open(os.path.join(self.log_dir, "bigrams.json"),
                  "w", encoding="utf-8") as f:
            json.dump(self.bigram_dic_, f)

    def filter(self):
        """Filters the results based on the configuration, saves the
        vocabulary and the word phrases"""
        logger.info("Building word phrases score")
        with open(os.path.join(self.log_dir, "unigrams.json"),
                  "r", encoding="utf-8") as f:
            self.unigram_dic_ = json.load(f)
        with open(os.path.join(self.log_dir, "bigrams.json"),
                  "r", encoding="utf-8") as f:
            self.bigram_dic_ = json.load(f)
        self.build_score()
        self.phrasewords_ = {}
        self.phrasewords()
        self.vocabulary_ = {}
        self.build_vocab()
        self.wordcount2freq()
        logger.info("Subsampling infrequent words")
        self.subsample_freq_dic()
        logger.info("Corpus fitted")
        self.fitted = True
        logger.info("Saving vocabulary")
        with open(os.path.join(self.log_dir, "vocabulary.json"),
                  "w", encoding="utf-8") as f:
            json.dump(self.vocabulary_, f)
        self.save_word_phrases()
        self.get_summary()

    def clean(self, text):
        """Parses a text, tokenizes, lowers and replaces ints and floats
        by a special token

        Args:
            text : str
                a text represented as a string
        Returns:
            words : str
                a clean text
        """
        words = self.tok.tokenize(text)
        words = " ".join(map(lambda x: convertFloat(convertInt(x.lower())), words))
        return words

    def build_score(self):
        """Adds the bigram score to the 'bigram_dic_' dictionary.

        bigram_dic_ = {bigram : occurrences} becomes:
        bigram_dic_ = {bigram : (occurrences, score)}
        """
        for bigrams in self.bigram_dic_.keys():
            i, j = bigrams.split(self.parsing_char_)
            score = (self.bigram_dic_[bigrams] - self.params["phrases_delta"]) / (
                self.unigram_dic_[i] * self.unigram_dic_[j])
            self.bigram_dic_[bigrams] = (self.bigram_dic_[bigrams], score)

    def build_vocab(self):
        """Creates a dictionary 'vocabulary_' which contains unigrams and
        word phrases, with their occurrences.
        """
        copy_dict = self.unigram_dic_.copy()
        for word in self.bigram_dic_:
            # First feed the vocabulary with bigrams :
            if word in self.phrasewords_:
                try:
                    i, j = (word.replace(self.parsing_char_, " ", 1)).split()
                    # delete unigrams if they only appear in a given bigram
                    if self.unigram_dic_[i] == self.phrasewords_[word]:
                        try:
                            # Delete element from copy_dict and not unigram_dic_
                            del copy_dict[i]
                        except:
                            pass
                    if self.unigram_dic_[j] == self.phrasewords_[word]:
                        try:
                            del copy_dict[j]
                        except:
                            pass
                    self.vocabulary_[word.replace(self.parsing_char_, "_")] = \
                        self.phrasewords_[word]
                except:
                    pass
        self.vocabulary_ = melt_vocab_dic(copy_dict, self.vocabulary_)

    def phrasewords(self):
        """Creates a dictionary 'phrasewords_' which contains word phrases,
        with their occurrences.
        """
        for bigrams in self.bigram_dic_:
            if self.bigram_dic_[bigrams][1] > self.params["phrases_threshold"]:
                self.phrasewords_[bigrams] = self.bigram_dic_[bigrams][0]

    def wordcount2freq(self):
        """Creates the 'vocab_freq_' dictionary: goes from a vocabulary_
        dictionary with occurrences to a dictionary of the vocabulary with
        frequencies. Useful for frequency subsampling.
        """
        count = 0
        dico = self.vocabulary_
        dico2 = {}
        for i in dico:
            count = count + dico[i]
        for i in dico:
            newkey = i.replace(self.parsing_char_, "_", 1)
            dico2[newkey] = dico[i] / count
        self.vocab_freq_ = dico2

    def subsample_freq_dic(self):
        """Vocab dictionary frequency subsampling.

        $$p = 1 - \\sqrt{\\frac{t}{f}}$$

        With $f$ the frequency of a given word, and $p$ the probability to
        discard the word.
        """
        t = self.params["freq_threshold"]
        vocab = self.vocab_freq_
        for word in self.vocab_freq_.keys():
            try:
                # In some very rare cases, doesn't work
                # Computing discarding word probability (Mik. 2013)
                freq = vocab[word]
                prob = 1 - sqrt(t / freq)
                # Simulating a uniform [0,1]
                # First initiate a random seed
                seed("sally14")  # random.seed() function hashes strings
                # Simulate a binomial B(prob)
                x = uniform(0, 1)
                if x < prob:
                    del self.vocabulary_[word]
            except:
                pass
        # Order vocab by frequency:
        self.vocabulary_ = OrderedDict(
            sorted(self.vocabulary_.items(), key=lambda x: x[1], reverse=True))
        # Cuts if max_voc_size
        if self.params["vocabulary_size"] is not None:
            self.vocabulary_ = {
                k: self.vocabulary_[k]
                for k in list(self.vocabulary_.keys())[:self.params["vocabulary_size"]]
            }

    def wordphrases(self, t):
        """Word-phrase gathering (into a single token, joined with _).

        Args:
            t : str
                a text to clean
        Returns:
            t : str
                the cleaned text
        """
        count = 0
        words = t.split(" ")
        new_words = []
        # First handling the case where the text is just one word :
        # cannot generate any bigram.
        if len(words) == 1:
            new_words = words
        # Then regular cases :
        else:
            j = 0
            while j < (len(words) - 1):  # = for each word in the sentence
                big = (words[j], words[j + 1])  # getting the (j-th, j+1-th) words
                # writing the corresponding bigram :
                bigrams = self.parsing_char_.join(big)
                # If the bigram is frequent enough to be gathered :
                if bigrams in self.phrasewords_:
                    # Then add the bigram as a new word in 'new_words'
                    new_words.append("_".join(big))
                    count = count + 1  # Count the number of gathered bigrams
                    # Directly go to the j+2-th word in order to avoid
                    # repeating the j+1-th word
                    j = j + 2
                # If the bigram is not frequent enough :
                else:
                    if j == (len(words) - 2):
                        new_words.append(words[j])
                        new_words.append(words[j + 1])
                        j = j + 2
                    else:
                        new_words.append(words[j])  # Add j-th word
                        j = j + 1  # Go to j+1-th word
        return " ".join(new_words)

    def transform_batch(self, filebatch):
        """Transforms a batch by cleaning the text, gathering word phrases,
        and replacing subsampled words by the UNK token.

        Args:
            filebatch : list of str
                the list of paths to the files
        """
        for file in filebatch:
            new_file = os.path.join(
                self.params["writing_dir"],
                os.path.basename(file) + "_cleaned" + ".txt",
            )
            text = openFile(file)
            cleaned_text = self.clean(text)
            del text
            # Word-phrase gathering
            cleaned_text = self.wordphrases(cleaned_text)
            # Frequency subsampling
            cleaned_text = " ".join(
                map(
                    lambda x: "UNK" if (x not in self.vocabulary_.keys()) else x,
                    cleaned_text.split(" "),
                ))
            with open(new_file, "w", encoding="utf-8") as f:
                f.write(cleaned_text)
            gc.collect()

    def transform(self, filenames):
        """Parallelizes the transformation, dumped in writing_dir

        Args:
            filenames : str or list of str
                the list of file names in the given batch
        """
        if not self.fitted:
            logger.error("No fitting, aborting")
        else:
            logger.info("Started transform")
            batches = self.get_batches(filenames)
            logger.info("Defined {} batches for multiprocessing".format(cpu_count()))
            logger.info("Starting parallelized transforming")
            pool = Pool(processes=cpu_count())
            pool.map(self.transform_batch, batches)
            pool.close()
            pool.terminate()
            pool.join()
            logger.info("Successfully transformed all the files")

    def save_word_phrases(self):
        """Saves word phrases as a json file in log_dir"""
        cleaned_phrases = {
            k.replace(self.parsing_char_, "_"): self.phrasewords_[k]
            for k in self.phrasewords_.keys()
        }
        with open(os.path.join(self.log_dir, "WordPhrases.json"),
                  "w", encoding="utf-8") as f:
            json.dump(cleaned_phrases, f)

    def get_summary(self):
        """Writes a summary of the fitting in the log_dir"""
        with open(os.path.join(self.log_dir, "summary.txt"),
                  "w", encoding="utf-8") as text:
            text.write("Attributes: \n-------------------- \n")
            text.write("len(unigram_dic_) : " + str(len(self.unigram_dic_)) + "\n"
                       + "len(bigram_dic_) : " + str(len(self.bigram_dic_)) + "\n"
                       + "len(phrasewords_) : " + str(len(self.phrasewords_)) + "\n"
                       + "len(vocabulary_) : " + str(len(self.vocabulary_)) + "\n \n")
            text.write("Bigram Dic extract :\n-------------------\n")
            dico = self.bigram_dic_
            head = dict([(key.replace(self.parsing_char_, "_"), dico[key])
                         for key in sorted(dico.keys())[len(dico) // 2:
                                                        len(dico) // 2 + 20]])
            text.write(str(head))
            text.write("\n\nPhrasewords Dic extract :\n-------------------\n ")
            dico = self.phrasewords_
            head = dict([(key.replace(self.parsing_char_, "_"), dico[key])
                         for key in sorted(dico.keys())[len(dico) // 2:
                                                        len(dico) // 2 + 20]])
            text.write(str(head))