Example No. 1
def file_to_features(path, word_vocab, window, min_count, total_w):
    examples = []
    toktok = ToktokTokenizer()
    punckt = set(string.punctuation)
    try:
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                for sentence in sent_tokenize(line):
                    words_1 = toktok.tokenize(sentence)
                    words_2 = []
                    for i, word in enumerate(words_1):
                        word_l = word.lower()
                        if word_l not in word_vocab:
                            continue
                        if word_vocab[word_l] < min_count:
                            continue
                        if word in punckt:
                            continue
                        frequency = word_vocab[word_l] / total_w
                        number = 1 - math.sqrt(10e-5 / frequency)
                        if random.uniform(0, 1) <= number:
                            continue
                        words_2.append(word)
                    max_j = len(words_2)
                    for i, word in enumerate(words_2):
                        start = i - window if (i - window) > 0 else 0
                        to = i + window if (i + window) < max_j else max_j
                        for j in range(start, to):
                            if i == j:
                                continue
                            target = words_2[j]
                            examples.append((word, target))
    except Exception as error:
        print(error)
    return examples
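For reference, a minimal self-contained sketch of the same subsampling rule on a toy corpus (the corpus string and the reuse of the 10e-5 threshold are illustrative assumptions; only NLTK and the standard library are required):

import math
from collections import Counter
from nltk.tokenize import ToktokTokenizer

toktok = ToktokTokenizer()
corpus = "the cat sat on the mat . the mat was flat ."  # toy corpus (assumption)
tokens = [t.lower() for t in toktok.tokenize(corpus)]
word_vocab = Counter(tokens)
total_w = sum(word_vocab.values())

# Same discard rule as in file_to_features: frequent words are dropped
# with probability 1 - sqrt(t / f).
for word, count in word_vocab.items():
    frequency = count / total_w
    p_discard = 1 - math.sqrt(10e-5 / frequency)
    print(word, round(max(p_discard, 0.0), 3))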
Example No. 2
def build_vocabs(directory_path, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    char_vocab.update(['{', '}'])
    filenames = os.listdir(directory_path)
    filepaths = [os.path.join(directory_path, e) for e in filenames]
    for i, filepath in enumerate(filepaths):
        if i % 100 == 0:
            print('Reading file number {}'.format(i), end="\r")
        with open(filepath, 'r', encoding='utf8') as f:
            try:
                line = f.read()
                if 'numbers_' in filepath:
                    tmp = toktok.tokenize(line.lower())
                    for _ in range(min_count):
                        word_vocab.update(tmp)
                else:
                    word_vocab.update(word_tokenize(line.lower()))
                char_vocab.update(line)
            except Exception as error:
                print('Error with file: {}'.format(filepath))
                print(error)
    return word_vocab, char_vocab
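A hedged usage sketch for build_vocabs; the directory name is hypothetical and the NLTK Punkt models are assumed to be available for word_tokenize:

import os
import nltk
from collections import Counter
from nltk.tokenize import ToktokTokenizer, word_tokenize

nltk.download('punkt')  # word_tokenize relies on the Punkt sentence models

# 'corpus_txt/' is a hypothetical directory of UTF-8 text files.
word_vocab, char_vocab = build_vocabs('corpus_txt/', min_count=5)
print(word_vocab.most_common(10))
print(len(char_vocab), "distinct characters")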
Example No. 3
def tokenize(i_file, o_file):
    toktok = ToktokTokenizer()
    with open(i_file, 'r') as i_f, open(o_file, 'w') as o_f:
        for line in tqdm(i_f):
            line = line.rstrip('\n')
            tokens = toktok.tokenize(line)
            print(' '.join(tokens), file=o_f)
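A quick usage sketch, assuming the helper above is in scope; the file names and contents are hypothetical:

from nltk.tokenize import ToktokTokenizer
from tqdm import tqdm

with open('sample_input.txt', 'w', encoding='utf8') as f:
    f.write("Hello, world!\nToktok splits off punctuation, too.\n")

tokenize('sample_input.txt', 'sample_output.txt')
# sample_output.txt now holds one space-joined line of tokens per input line,
# e.g. "Hello , world !"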
Example No. 4
def extract_wiki_fdict():
    f_count = 0
    # for each wiki table, get header name, and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_fdict_path, 'w')
    toktok = ToktokTokenizer()
    tid = 0
    pool = mp.Pool()
    for line in f:
        tid += 1
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except:
            continue
        #combine header and features
        cols_features = pool.map(gov_data.get_col_features,
                                 list(header_content.values()))
        all_col_features = list(
            zip([each[0] for each in header_span], cols_features))
        for i in range(len(all_col_features)):
            if all_col_features[i][1]:
                all_col_features[i][1]['content'] = header_bows[i]
        all_col_features = list(filter(lambda x: x[1], all_col_features))
        f_dest.write(json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
Example No. 5
def loss_char(sentence, position):
    toktok = ToktokTokenizer()
    if sentence[position] in r" ,./;'[]\<>?:{}!@#$% ^&*()":
        return sentence
    if sentence[position] == " ":
        return sentence
    if sentence[position] in toktok.tokenize(sentence):
        return sentence
    return sentence[:position] + sentence[position + 1:]
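A quick behavioural check, assuming loss_char is defined as above:

from nltk.tokenize import ToktokTokenizer

print(loss_char("hello world", 1))  # -> "hllo world": the 'e' is deleted
print(loss_char("hello world", 5))  # -> "hello world": spaces are left untouched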
Example No. 6
def RemoveWords_by_tag(text):
    remove_tag_list = ['JJ', 'JJR', 'JJS', 'RBR', 'RBS']
    token = ToktokTokenizer()
    words = token.tokenize(text)
    words_tagged = nltk.pos_tag(words)
    filtered = untag([
        w for w in words_tagged if w[1] not in remove_tag_list
    ])  # keep only words whose tag is not in the removal list

    return ' '.join(map(str, filtered))
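A hedged usage sketch (the sentence is illustrative; it assumes the NLTK POS-tagger model has already been downloaded for nltk.pos_tag):

import nltk
from nltk.tag import untag
from nltk.tokenize import ToktokTokenizer

sentence = "The quick brown fox jumps over the lazy dog"
print(RemoveWords_by_tag(sentence))
# adjectives tagged JJ such as "quick" and "lazy" are dropped from the output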
Example No. 7
def extract_wiki_features(wiki_feature_path, wiki_bow_path):
    f_count = 0
    # for each wiki table, get header name, and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_feature_path, 'w')
    f_bow = open(wiki_bow_path, 'w')
    toktok = ToktokTokenizer()
    for line in f:
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except:
            continue
        #combine header and features
        for col, f_dict, bows in zip([each[0] for each in header_span],
                                     map(get_col_features,
                                         header_content.values()),
                                     header_bows.values()):
            if f_dict:
                f_dict['_id'] = t['_id']
                f_dest.write(json.dumps({col: f_dict}) + '\n')
                f_bow.write(json.dumps({col: bows}) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
Example No. 8
def extract_gov_fdict(all_resources,
                      fdict_path=gov_data_fdict_path,
                      tid_type='cat_id',
                      restrict_resource=False):
    #extracting features:
    #table_id;label,curated_features,content;label,curated_features...
    f = open(fdict_path, 'w')
    #all_resources = gov_data.read_resources()
    all_resources = gov_data.wrong_csv(all_resources)
    all_resources = list(filter(lambda x: x.status, all_resources))
    if restrict_resource:
        all_resources = gov_data.select_resources(all_resources,
                                                  fsize=50,
                                                  rs_ct=len(all_resources))
    pool = mp.Pool()
    total = len(all_resources)
    count = 0
    toktok = ToktokTokenizer()
    for resource in all_resources:
        print("processing {0}-th resource".format(count))
        for each_data in resource.data_files:
            try:
                if tid_type == 'cat_id':
                    tid = resource.rs_id + ':' + each_data.df_id
                elif tid_type == 'path':
                    tid = resource.path + '/' + each_data.df_id
                d_path = each_data.path + '/data.csv'
                df = pd.read_csv(d_path,
                                 delimiter=',',
                                 quotechar='"',
                                 dtype=str,
                                 na_filter=True)
                cols = df.columns
                contents = [
                    df[each_col].dropna().tolist() for each_col in cols
                ]
                print("extract content finished")
                cols_features = pool.map(gov_data.get_col_features, contents)
                all_col_features = list(zip(cols, cols_features))
                for i in range(len(all_col_features)):
                    if all_col_features[i][1]:
                        all_col_features[i][1]['content'] = toktok.tokenize(
                            ' '.join(contents[i]))
                all_col_features = list(
                    filter(lambda x: x[1], all_col_features))
                f.write(
                    json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
            except Exception as e:
                print(e)
        count += 1
        print("finish {0} out of {1}".format(count, total))
    f.close()
    return all_resources
Example No. 9
class Solver(AbstractSolver):
    def __init__(self):
        self.morph = morph
        self.toktok = ToktokTokenizer()
        self.bert = BertEmbedder()

    def get_num(self, text):
        lemmas = [
            self.morph.parse(word)[0].normal_form for word in self.toktok.tokenize(text)
        ]
        if "указывать" in lemmas and "предложение" in lemmas:
            w = lemmas[lemmas.index("указывать") + 1]
            d = {"один": 1, "два": 2, "три": 3, "четыре": 4, "предложение": 1}
            if w in d:
                return d[w]
        elif "указывать" in lemmas and "вариант" in lemmas:
            return 2
        return 2

    def compare_text_with_variants(self, variants):
        variant_vectors = self.bert.sentence_embedding(variants)
        predicts = []
        for i in range(0, len(variant_vectors)):
            for j in range(i + 1, len(variant_vectors)):
                sim = cosine_similarity(
                    variant_vectors[i].reshape(1, -1), variant_vectors[j].reshape(1, -1)
                ).flatten()[0]
                predicts.append(pd.DataFrame({"sim": sim, "i": i, "j": j}, index=[1]))
        predicts = pd.concat(predicts)
        indexes = predicts[predicts.sim == predicts.sim.max()][["i", "j"]].values[0]
        return sorted([str(i + 1) for i in indexes])

    def sent_split(self, text):
        reg = r"\(*\d+\)"
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r"\(*1\)", task["text"])[:2]
        variants = [t["text"] for t in task["question"]["choices"]]
        text, task = "", ""
        if "Укажите" in task_text:
            text, task = re.split("Укажите ", task_text)
            task = "Укажите " + task
        elif "Укажите" in first_phrase:
            text, task = task_text, first_phrase
        return text, task, variants

    def predict_from_model(self, task):
        text, task, variants = self.process_task(task)
        result = self.compare_text_with_variants(variants)
        return result
Example No. 10
def index(request):
    global invertedIndex
    global jsonData

    output_links = []
    searchTermsReq = request.GET.get('term', '')

    print(searchTermsReq)

    tokenizer = ToktokTokenizer()

    searchTerms = tokenizer.tokenize(searchTermsReq)

    print(searchTerms)

    response = {}

    output_data = defaultdict(int)
    output_links = []

    for token in searchTerms:
        token = token.lower()
        if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
            print('Looking through high for: ' + token)
            for docFilePath in invertedIndex[token]['high']:
                tfidf = invertedIndex[token]['high'][docFilePath]
                output_data[docFilePath] += tfidf

    if (len(output_data) < 10):
        for token in searchTerms:
            token = token.lower()
            if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
                print('Looking through low for: ' + token)
                for docFilePath in invertedIndex[token]['low']:
                    tfidf = invertedIndex[token]['low'][docFilePath]
                    output_data[docFilePath] += tfidf

    output_data = sorted(output_data.items(), key=itemgetter(1), reverse=True)

    for docFilePath, tfidf in output_data[:10]:
        output_links.append((jsonData[docFilePath], tfidf))

    output_links.sort(key=itemgetter(1), reverse=True)

    response['term'] = searchTermsReq
    response['results'] = output_links
    response['totalURLs'] = len(output_data)
    response['uniqueTokens'] = len(invertedIndex)
    response['totalDocuments'] = len(jsonData)

    return JsonResponse(response)
Example No. 11
def error_generator(utterance):
    toktok = ToktokTokenizer()
    length = len(utterance)
    nb = nb_of_errors_in_utterance(length) + 1
    utterance = utterance + " "

    for i in range(nb):
        length = len(utterance) - 1
        position = np.random.choice(range(length), p=(length) * [1 / (length)])
        l = len(toktok.tokenize(utterance))
        utterance_old = utterance
        nb = np.random.randint(1, 5)
        utterance = functions[nb](utterance, position)

    return utterance
Example No. 12
def clean_archive_data(folder):
    toktok = ToktokTokenizer()
    if not os.path.exists(f"{folder}-cleaned"):
        os.makedirs(f"{folder}-cleaned")
    for count, file in enumerate(os.listdir(f"{folder}")):
        if count % 1000 == 0:
            print(count)
        file_data = open(f"{folder}/{file}", "r").read()
        try:
            text_newspaper = toktok.tokenize(fulltext(file_data))
            text_newspaper_cleaned = clean(" ".join(text_newspaper))
            with open(f"{folder}-cleaned/{file}", "w") as output:
                output.write(text_newspaper_cleaned)
        except: # pylint: disable=W0702
            print(f"error with {file}", file=sys.stderr)
Example No. 13
def preprocess(data):
    X, Y = [], []
    toktok = ToktokTokenizer()
    for index, review in data.iterrows():
        if (index+1) % 100000 == 0:
            print(index+1)
        # words = nltk.word_tokenize(review['text'])
        tokens = toktok.tokenize(review['text'].lower())
        X.append(tokens)
        # X.append(nltk.word_tokenize(review['text']))
        Y.append(int(review['stars'] - 1))
        # if len(Y) == 10000:
        #     break
    df_new = pd.DataFrame({'text': X, 'stars': Y})
    return df_new
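A minimal usage sketch with a toy reviews frame (the 'text' and 'stars' column names follow the function above; the data itself is made up):

import pandas as pd
from nltk.tokenize import ToktokTokenizer

data = pd.DataFrame({
    'text': ["Great food, friendly staff!", "Too noisy. Would not return."],
    'stars': [5, 2],
})
df_new = preprocess(data)
print(df_new)
# 'text' holds the lower-cased token lists, 'stars' the ratings shifted to 0-4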
Example No. 14
def build_vocabs(filepath, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    with open(filepath, 'r', encoding='utf8') as f:
        try:
            line = f.read()
            if 'numbers_' in filepath:
                tmp = toktok.tokenize(line.lower())
                for _ in range(min_count):
                    word_vocab.update(tmp)
            else:
                word_vocab.update(word_tokenize(line.lower()))
            char_vocab.update(line)
        except Exception as error:
            print('Error with file: {}'.format(filepath))
            print(error)
    return word_vocab, char_vocab
Example No. 15
def word_frequencies(contents):
    toktok = ToktokTokenizer()
    string_corpus = brown.raw()

    # Frequencies for each file
    fdist_list = []
    for file in contents.keys():
        print("Tokenising", file)
        tokenised = [
            toktok.tokenize(sent) for sent in sent_tokenize(string_corpus)
        ]
        fdist = Counter(chain(*tokenised))
        fdist_list.append(fdist)

    # Combine keys into one set, eliminating duplicates
    print("Making frequency distribution of all words that we care about.")
    keys = []
    for sublist in fdist_list:
        keys += sublist
    keys = set(keys)

    # Build combined frequency dict
    # Tuple of identifiers for connectives and other common words
    unwanted = ('at', 'to', 'in', 'ma', 'bez', 'ppss', 'pp$', 'dt', 'bedz',
                'hv', 'cc', 'cs', 'hvd', 'wdt', '*', 'bed', 'ber', 'be', 'np$',
                'ppo', 'pps', 'abn', 'cd', 'md', 'ben', 'ben', 'wps', 'vbd',
                'jj', 'rb', 'do', 'ql', 'dts', 'rp', 'in-tl', 'ex', 'i', 'dti',
                'dod', 'wrb', 'hvz', 'nn$')
    # This is far from the best way to do this, but I couldn't find the documentation for these identifiers
    frequencies = {}
    for key in keys:
        total = 0
        if (key[0] not in string.punctuation) and (
                key.split('/')[-1]
                not in unwanted):  # Gets rid of unwanted tokens
            for sublist in fdist_list:
                if key in sublist.keys():
                    total += sublist[key]
            frequencies[key.split('/')[0].lower()] = total
    print("Total words (that we care about): " + str(len(frequencies.keys())))

    return frequencies
Example No. 16
def select_DW_columns(labels):
    wl = set()
    for each in brown.words():
        each = each.lower()
        if each.isalpha() and (each not in wl):
            wl.add(each)
    DW_labels = []
    DW_idx = []
    toktok = ToktokTokenizer()
    for idx, label in enumerate(labels):
        tokens = toktok.tokenize(label)
        flag = True
        for token in tokens:
            if token.isdigit():
                continue
            elif token.lower() not in wl:
                flag = False
                break
        if flag:
            DW_labels.append(label)
            DW_idx.append(idx)
    return DW_idx, DW_labels
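An illustrative call, assuming the Brown corpus has been downloaded; the labels are hypothetical column headers:

import nltk
from nltk.corpus import brown
from nltk.tokenize import ToktokTokenizer

nltk.download('brown')  # the Brown corpus backs the dictionary-word check

labels = ["Population 2010", "Revenue", "xjq_code", "City name"]
idx, dw_labels = select_DW_columns(labels)
print(dw_labels)  # keeps labels whose tokens are digits or Brown-corpus words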
Example No. 17
class Dictionary(object):
    '''
    TODO:
    A lot of error cases to handle:
    1. no such file (due to download errors)
    2. the file exists as data.csv but is not a valid CSV -> column errors
    '''
    def __init__(self):
        self.wl = set()
        for each in brown.words():
            each = each.lower()
            if each.isalpha() and (each not in self.wl):
                self.wl.add(each)
        self.toktok = ToktokTokenizer()

    def isDW(self, label):
        tokens = self.toktok.tokenize(label)
        flag = True
        for token in tokens:
            if token.lower() not in self.wl:
                flag = False
                break
        return flag
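A brief usage sketch, assuming the same Brown-corpus setup as in the previous example:

d = Dictionary()
print(d.isDW("annual rainfall"))  # True when every token appears in the Brown corpus
print(d.isDW("qz17_tmp"))         # False for out-of-dictionary tokens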
Example No. 18
def preprocess_advanced(data):
    X, Y = [], []
    toktok = ToktokTokenizer()
    en_stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    for index, review in data.iterrows():
        if (index+1) % 100000 == 0:
            print(index+1)
        # words = nltk.word_tokenize(review['text'])
        tokens = toktok.tokenize(review['text'].lower())
        # tokens = word_tokenize(doc.lower())
        stopped_tokens = filter(lambda token: token not in en_stop, tokens)
        stemmed_tokens = map(lambda token: p_stemmer.stem(token), stopped_tokens)
        # if not return_tokens:
        #     return ' '.join(stemmed_tokens)
        # return list(stemmed_tokens)

        X.append(list(stemmed_tokens))
        # X.append(nltk.word_tokenize(review['text']))
        Y.append(int(review['stars'] - 1))
        if len(Y) == 1000:
            break
    df_new = pd.DataFrame({'text': X, 'stars': Y})
    return df_new
Example No. 19
class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def get_num(self, text):
        lemmas = [
            self.morph.parse(word)[0].normal_form
            for word in self.toktok.tokenize(text)
        ]
        if 'указывать' in lemmas and 'предложение' in lemmas:
            w = lemmas[lemmas.index('указывать') + 1]  # first
            d = {'один': 1, 'два': 2, 'три': 3, 'четыре': 4, 'предложение': 1}
            if w in d:
                return d[w]
        elif 'указывать' in lemmas and 'вариант' in lemmas:
            return 'unknown'
        return 1

    def compare_text_with_variants(self, text, variants, num=1):
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        i, predictions = 0, {}
        for j in variant_vectors:
            sim = cosine_similarity(text_vector[0].reshape(1, -1),
                                    j.reshape(1, -1)).flatten()[0]
            predictions[i] = sim * (len(variants[i])**(1 / 5))
            i += 1
        #print(1,predictions)
        #indexes = sorted(predictions.items(), key=operator.itemgetter(1), reverse=True)[:num]
        #print(2,indexes)
        #return [str(i[0] + 1) for i in indexes]
        return predictions

    def sent_split(self, text):
        reg = r'\(*\d+\)'
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r'\(*1\)', task['text'])[:2]
        variants = [
            t['text'].replace("—", "").replace("<...>", "").replace(
                "<…>",
                "").replace(",", "").replace(".", "").replace(":", "").replace(
                    "»", "").replace("«", "").replace("-", " ")
            for t in task['question']['choices']
        ]
        text, task = "", ""
        if 'Укажите' in task_text:
            text, task = re.split('Укажите ', task_text)
            task = 'Укажите ' + task
        elif 'Укажите' in first_phrase:
            text, task = task_text.replace("—", "").replace(
                "<...>", "").replace("<…>", "").replace(",", "").replace(
                    ".", "").replace(":", "").replace("»", "").replace(
                        "«", "").replace("-", " "), first_phrase
        return text, task, variants

    def fit(self, tasks):
        pass

    def load(self, path=""):
        pass

    def save(self, path=''):
        pass

    def predict_from_model(self, task, num=2):
        #print(task["id"])
        text, task, variants = self.process_task(task)
        text = re.sub('[0-9]*\)', '', text).replace('   ',
                                                    ' ').replace('  ', ' ')
        for i, _ in enumerate(variants):
            variants[i] = re.sub('[0-9]*\)', '', variants[i])
            variants[i] = variants[i].replace('   ', ' ').replace('  ', ' ')
        #print(text)
        #print(variants)
        result = self.compare_text_with_variants(text, variants, num=num)
        text = [text]
        text.extend(variants)
        result2 = self.compare_text_with_variants2(text)
        indexes1 = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)[:num]

        #print(1,[str(i[0] + 1) for i in indexes1])
        indexes2 = sorted(result2.items(),
                          key=operator.itemgetter(1),
                          reverse=True)[-num:]
        #print(2,[str(i[0] + 1) for i in indexes2])
        symm1, symm2 = 0, 0
        for i in range(len(result)):
            symm1 += result[i]
            symm2 += result2[i]
        dif = symm2 / symm1
        for i in range(len(result)):
            #print(i+1,result[i],result2[i])
            result[i] -= result2[i] / (dif * 4)
        #print(result)
        indexes = sorted(result.items(),
                         key=operator.itemgetter(1),
                         reverse=True)[:num]
        ans = [str(i[0] + 1) for i in indexes]

        return sorted(ans)
Example No. 20
def create_lol_attack_ontology(lolbin_data, attack_windows):
    '''
    Takes in lolbin and attack lists and returns the resulting merged ontology
    :param lolbin_data: list of dictionaries of parsed lolbins
    :param attack_windows: list of dictionaries of parsed lolbins
    :return: merged ontology
    '''
    functions_to_attack = {'ADS' : 'T1096',
                           'Compile' : 'T1127',
                           'Create Service' : 'T1050',
                           'Start Service' : 'T1035',
                           'NTDS.dit' : 'T1003',
                           'UACBypass' : 'T1088',
                           'Download' : 'T1105'}

    toktok = ToktokTokenizer()
    ontology = {}
    for i in range(len(lolbin_data)):
        name = lolbin_data[i].get('name')
        functions = lolbin_data[i].get('functions', [])
        examples = lolbin_data[i].get('examples', [])
        lol_link = lolbin_data[i].get('link', None)
        if lol_link is None:
            lol_link = []
        short_name = name.split('.')[0]
        # clean up cases where the list of examples has comments and unrelated lines
        examples = [example for example in examples if short_name.lower() in example.lower()]
        found = False
        attack_tid_strong = set()
        attack_tid_weak = set()
        for attack in attack_windows:
            attack_name = attack.get('name')
            description = attack.get('description').lower()
            description_tokenized = toktok.tokenize(description)

            if name in description_tokenized:
                attack_tid_strong.add(attack.get('tid'))
            if short_name in description_tokenized:
                attack_tid_weak.add(attack.get('tid'))
            for function in functions:
                for k, v in functions_to_attack.items():
                    if k in function:
                        attack_tid_strong.add(v)
        ontology[name.lower()] = {'functions' : functions,
                          'examples' : examples,
                          'attack_ids_strong' : attack_tid_strong,
                          'attack_ids_weak' : attack_tid_weak,
                          'short_name' : short_name,
                          'references' : lol_link}

    # One more pass.  If all the examples for an executable, library, or script involve being invoked by a different executable, we will change the mapping
    ontology_tools = ontology.keys()
    for name, data in ontology.items():
        examples = data.get('examples')
        # Get deduped list of initial executable or tool name used in the example for each example
        tool_in_example = set()
        for example in examples:
            tokens = example.split()
            if len(tokens) > 0:
                tool_in_example.add(tokens[0].strip().lower())
        # Check if the tools listed are actually directly a MITRE ATT&CK technique.  If so, directly map to it.
        if len(tool_in_example) == 1:
            tool_name = tool_in_example.pop().strip().lower().split('.')[0]
            # the tool in the example is the only one given and it is different from the primary lolbas name
            for attack in attack_windows:
                attack_short_name = attack['name'].split('.')[0].lower()
                tid = attack['tid']
                if tool_name == attack_short_name:
                    ontology[name]['attack_ids_strong'].add(tid)

    # Exceptions:
    clear_weak = ['regsvr32.exe', 'powershell.exe', 'control.exe', 'expand.exe', 'winword.exe',
                  'explorer.exe', 'replace.exe', 'bash.exe']
    clear_strong = ['winword.exe', 'explorer.exe', 'replace.exe', 'bash.exe']
    for weak_to_clear in clear_weak:
        ontology[weak_to_clear]['attack_ids_weak'] = set()
    for strong_to_clear in clear_strong:
        ontology[strong_to_clear]['attack_ids_strong'] = set()

    ontology['powershell.exe']['attack_ids_strong'] = set(['T1086'])

    # Remove misclassifications
    try:
        ontology['sc.exe']['attack_ids_weak'].remove('T1197')
    except:
        pass
    try:
        ontology['url.dll']['attack_ids_weak'].remove('T1192')
    except:
        pass
    try:
        ontology['sc.exe']['attack_ids_strong'].remove('T1013')
    except:
        pass

    add_scripting = ['testxlst.js', 'scriptrunner.exe', 'runscripthelper.exe', 'msdeploy.exe', 'manage-bde.wsf', 'te.exe', 'cscript.exe']
    for add_script in add_scripting:
        ontology[add_script]['attack_ids_strong'].add('T1064')

    ontology['ieexec.exe']['attack_ids_strong'].add('T1105')
    ontology['msiexec.exe']['attack_ids_strong'].add('T1105')
    ontology['ieexec.exe']['functions'] = list(set(ontology['ieexec.exe']['functions']).union(['Download']))
    ontology['msiexec.exe']['functions'] = list(set(ontology['msiexec.exe']['functions']).union(['Download']))

    ### Add T1202 indirect execution
    indirect_execution = ['explorer.exe', 'dnscmd.exe', 'winword.exe', 'extexport.exe', 'vsjitdebugger.exe',
                          'csi.exe', 'hh.exe', 'appvlp.exe', 'scriptrunner.exe', 'dxcap.exe', 'ieexec.exe',
                          'openwith.exe', 'pcwrun.exe', 'msiexec.exe', 'bash.exe', 'msdeploy.exe', 'mftrace.exe']
    for indirect_exec in indirect_execution:
        ontology[indirect_exec]['attack_ids_strong'].add('T1202')

    # Combine all the strong and weak technique IDs
    for name, data in ontology.items():
        data['attack_ids'] = list(data['attack_ids_strong'].union(data['attack_ids_weak']))
        data.pop('attack_ids_strong')
        data.pop('attack_ids_weak')

    return ontology
Example No. 21
    def __init__(self):
        data_path = config.data_path
        ratio = config.freq_ratio
        start_vocabs = config.start_vocabs
        self.buckets = config.buckets

        print("Reading 'tasks.csv' file...")
        with open(data_path, 'r', encoding="utf-8") as f:
            reader = csv.reader(f, skipinitialspace=True)
            next(reader)
            sentences = [x[0].lower() for x in reader]
            self.sentences = sentences[:train_size]
        print("{} sentences loaded.".format(len(self.sentences)))

        # tokenize sentences
        tok = ToktokTokenizer()
        self.tokenized_sens = [tok.tokenize(sen) for sen in self.sentences]

        # clean sentences and only consider sentences with length > 1
        self.tokenized_sens = [[x for x in sen if x.isalpha()]
                               for sen in self.tokenized_sens]
        self.tokenized_sens = [
            sen for sen in self.tokenized_sens if sen != [] and len(sen) > 1
        ]

        # remove low frequency words and index them
        frequency_words = nltk.FreqDist(itertools.chain(*self.tokenized_sens))
        size = len(list(set(itertools.chain(*(self.tokenized_sens)))))
        self.vocabs = start_vocabs + [
            w[0] for w in frequency_words.most_common(int(size * ratio))
        ]
        self.vocab_size = len(self.vocabs)
        self.word_to_index = dict([(w, i) for i, w in enumerate(self.vocabs)])
        self.tokenized_sens = [[
            w if w in self.vocabs else '_unk' for w in sen
        ] for sen in self.tokenized_sens]

        # create train data
        self.x_train = [[self.word_to_index[w] for w in sen[:-1]]
                        for sen in self.tokenized_sens]
        self.y_train = [[self.word_to_index[w] for w in sen[1:]]
                        for sen in self.tokenized_sens]
Example No. 22
test['brand'] = le.transform(test.brand_name)
del le, train['brand_name'], test['brand_name']

# Replace the category slash
test["category_name_split"] = test["category_name"].str.replace(' ', '_')
train["category_name_split"] = train["category_name"].str.replace(' ', '_')
test["category_name_split"] = test["category_name_split"].str.replace('/', ' ')
train["category_name_split"] = train["category_name_split"].str.replace(
    '/', ' ')
train.head()
print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() -
                                                            start_time))

toktok = ToktokTokenizer()
train['name_token'] = [
    " ".join(toktok.tokenize(sent))
    for sent in train['name'].str.lower().tolist()
]
test['name_token'] = [
    " ".join(toktok.tokenize(sent))
    for sent in test['name'].str.lower().tolist()
]
print('[{}] Finished Tokenizing text...'.format(time.time() - start_time))

#PROCESS TEXT: RAW
print("Text to seq process...")
print("   Fitting tokenizer...")

import re
rgx = re.compile('[%s]' % '!"#%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
Example No. 23
class Solver(BertEmbedder):

    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def get_num(self, text):
        lemmas = [self.morph.parse(word)[0].normal_form for word in self.toktok.tokenize(text)]
        if 'указывать' in lemmas and 'предложение' in lemmas:
            w = lemmas[lemmas.index('указывать') + 1]  # first
            d = {'один': 1,
                 'два': 2,
                 'три': 3,
                 'четыре': 4,
                 'предложение': 1}
            if w in d:
                return d[w]
        elif 'указывать' in lemmas and 'вариант' in lemmas:
            return 'unknown'
        return 1

    def compare_text_with_variants(self, text, variants, num=1):
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        i, predictions = 0, {}
        for j in variant_vectors:
            sim = cosine_similarity(text_vector[0].reshape(1, -1), j.reshape(1, -1)).flatten()[0]
            predictions[i] = sim
            i += 1
        indexes = sorted(predictions.items(), key=operator.itemgetter(1), reverse=True)[:num]
        return sorted([str(i[0] + 1) for i in indexes])

    def sent_split(self, text):
        reg = r'\(*\d+\)'
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r'\(*1\)', task['text'])[:2]
        variants = [t['text'] for t in task['question']['choices']]
        text, task = "", ""
        if 'Укажите' in task_text:
            text, task = re.split('Укажите ', task_text)
            task = 'Укажите ' + task
        elif 'Укажите' in first_phrase:
            text, task = task_text, first_phrase
        return text, task, variants

    def fit(self, tasks):
        pass

    def load(self, path=""):
        pass
    
    def save(self, path=''):
        pass

    def predict_from_model(self, task, num=2):
        text, task, variants = self.process_task(task)
        result = self.compare_text_with_variants(text, variants, num=num)
        return result
Example No. 24
class Pipeline:
    def __init__(self, stopwords: Set[str]) -> None:
        self.stopwords = stopwords
        self.ps = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english")
        self.tokenizer = ToktokTokenizer()
        self.puncuation = set(string.punctuation)
        # self.words = set(nltk.corpus.words.words())
        self.pipeline = [
            self.remove_punctuation,
            self.tokenize,
            self.lowering,
            self.remove_words,
            self.remove_stopwords,
            self.remove_digits_and_punctuation,
            self.remove_dangling_puncuation,
            self.remove_single,
            self.stemm,
            self.remove_starting_with_file,
        ]
        self.words_to_remove = set(
            "edit wookieepedia format registerr wrapup wiki sandbox click edit page link code preview button format"
            .split(" "))

    def remove_starting_with_file(
            self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if not word.startswith("file"):
                yield word

    def remove_words(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if word not in self.words_to_remove:
                yield word

    def cleanup_space(self, a_string: str) -> str:
        return a_string.replace("|", " ").replace("\n", " ")

    def remove_dangling_puncuation(
            self, document_iterable: Iterable[str]) -> Iterable[str]:
        return [w.strip(string.punctuation) for w in document_iterable]

    def tokenize(self, document: str) -> Iterable[str]:
        for word in self.tokenizer.tokenize(document):
            yield word

    def remove_digits_and_punctuation(
            self, document_iterable: Iterable[str]) -> Iterable[str]:
        return [
            w for w in document_iterable
            if not all(x.isdigit() or x in self.puncuation for x in w)
        ]

    def remove_punctuation(self, document: str) -> str:
        return document.translate(str.maketrans("", "", string.punctuation))

    def remove_stopwords(self,
                         document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if word_not_in_set(word, self.stopwords):
                yield word

    def remove_everything_with_digit(
            self, document_iterable: Iterable[str]) -> Iterable[str]:
        return [
            w for w in document_iterable if not any(x.isdigit() for x in w)
        ]

    def lowering(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            yield word.lower()

    def lemmatize(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            yield self.ps.lemmatize(word)

    def stemm(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            yield self.stemmer.stem(word)

    def remove_single(self, document_iterable: Iterable[str]) -> Iterable[str]:
        for word in document_iterable:
            if len(word) > 1:
                yield word

    def pipe(self, document: str) -> Iterable[str]:
        ob = document
        for task in self.pipeline:
            ob = task(ob)
        return ob
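A hedged usage sketch. The word_not_in_set helper referenced by remove_stopwords is not shown in this snippet, so a plain membership test is assumed here; the stopword set and the document are illustrative, and the class above together with its imports is assumed to be in scope:

from typing import Set

def word_not_in_set(word: str, words: Set[str]) -> bool:
    # assumed implementation of the helper used in remove_stopwords above
    return word not in words

pipe = Pipeline(stopwords={"the", "a", "of", "this"})
print(list(pipe.pipe("The Clone Wars of 2008: edit this wiki page!")))
# -> ['clone', 'war'] after stopword removal, digit filtering and stemming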
Example No. 25
# TODO count the number of words per summary/ summary stats:
toktok = ToktokTokenizer()
num_sents_list = []
num_words_list = []
word_counter = Counter()
word_counter_lower = Counter()
for count, file in enumerate(os.listdir("/data/corpora/newser/data-final")):
    print(count)
    file_data = open(
        f"/data/corpora/newser/data-final/{file}/{file}.reference.txt",
        "r").read()
    summary_sents = sent_tokenize(file_data)
    num_sents_list.append(len(summary_sents))
    num_words = 0
    for sent in summary_sents:
        cur_words = toktok.tokenize(sent)
        word_counter.update(cur_words)
        word_counter_lower.update([word.lower() for word in cur_words])
        num_words += len(cur_words)
    num_words_list.append(num_words)
num_sents_np = np.array(num_sents_list)
num_words_np = np.array(num_words_list)

print(
    f"the average number of sentences per summary: {np.mean(num_sents_np)}\n")
print(f"the std of sentences per summary: {np.std(num_sents_np)}\n")
print(f"the average number of words per summary: {np.mean(num_words_np)}\n")
print(f"the std of words per summary: {np.std(num_words_np)}\n")

print(len(word_counter))
print(len(word_counter_lower))
Example No. 26
le.fit(np.hstack([train.brand_name, test.brand_name]))
train['brand'] = le.transform(train.brand_name)
test['brand'] = le.transform(test.brand_name)
del le, train['brand_name'], test['brand_name']

# Replace the category slash
test["category_name_split"] = test["category_name"].str.replace(' ', '_')
train["category_name_split"] = train["category_name"].str.replace(' ', '_')
test["category_name_split"] = test["category_name_split"].str.replace('/', ' ')
train["category_name_split"] = train["category_name_split"].str.replace('/', ' ')
train.head()
print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() - start_time))


toktok = ToktokTokenizer()
train['name_token'] = [" ".join(toktok.tokenize(sent)) for sent in train['name'].str.lower().tolist()]
test['name_token'] = [" ".join(toktok.tokenize(sent)) for sent in test['name'].str.lower().tolist()]
#train['item_description_token'] = [" ".join(toktok.tokenize(sent[:400])) for sent in train['item_description'].str.lower().tolist()]
#test['item_description_token'] = [" ".join(toktok.tokenize(sent[:400])) for sent in test['item_description'].str.lower().tolist()]
print('[{}] Finished Tokenizing text...'.format(time.time() - start_time))


#PROCESS TEXT: RAW
print("Text to seq process...")
print("   Fitting tokenizer...")


import re
rgx = re.compile('[%s]' % '!"#%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')   
                 
@jit
Example No. 27
class Solver(BertEmbedder):

    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def clean_text(self, text):
        newtext, logic = [], ["PREP", "CONJ", "Apro", "PRCL", "INFN", "VERB", "ADVB"]
        for token in self.toktok.tokenize(text):
            if any(tag in self.morph.parse(token)[0].tag for tag in logic):
                newtext.append(self.morph.parse(token)[0].normal_form)
        return ' '.join(newtext)

    def get_pos(self, text):
        pos, lemmas = 'word', [self.morph.parse(word)[0].normal_form for word in
                  self.toktok.tokenize(text)]
        if 'сочинительный' in lemmas:
            pos = "CCONJ"
        elif 'подчинительный' in lemmas:
            pos = "SCONJ"
        elif 'наречие' in lemmas:
            pos = "ADV"
        elif 'союзный' in lemmas:
            pos = "ADVPRO"
        elif 'местоимение' in lemmas:
            pos = "PRO"
        elif 'частица' in lemmas:
            pos = "PART"
        return pos

    def get_num(self, text):
        lemmas = [self.morph.parse(word)[0].normal_form for word in
                  self.toktok.tokenize(text)]
        if 'слово' in lemmas and 'предложение' in lemmas:
            d = {'один': 1,
                 'два': 2,
                 'три': 3,
                 'четыре': 4,
                 'первый': 1,
                 'второй': 2,
                 'третий': 3,
                 'четвертый': 4,
                 }
            for i in lemmas:
                if i in d:
                    return d[i]
        return 1

    def sent_split(self, text):
        reg = r'\(\n*\d+\n*\)'
        return re.split(reg, text)

    def compare_text_with_variants(self, word, text, variants):
        sents = self.sent_split(text)
        for sent in sents:
            lemmas = [self.morph.parse(word)[0].normal_form for word in
                  self.toktok.tokenize(text)]
            if word.lower() in lemmas:
                text = sent
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        i, predictions = 0, {}
        for j in variant_vectors:
            sim = cosine_similarity(text_vector[0].reshape(1, -1), j.reshape(1, -1)).flatten()[0]
            predictions[i] = sim
            i += 1
        indexes = sorted(predictions.items(), key=operator.itemgetter(1), reverse=True)[:1]
        return sorted([str(i[0] + 1) for i in indexes])

    def process_task(self, task):
        try:
            first_phrase, task_text = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            first_phrase, task_text = ' '.join(re.split(r'\(\n*1\n*\)', task['text'])[:-1]), \
                                    re.split(r'\(\n*1\n*\)', task['text'])[-1]
        variants = [t['text'] for t in task['question']['choices']]
        text, task, word = "", "", ""
        if 'Определите' in task_text:
            text, task = re.split('Определите', task_text)
            task = 'Определите ' + task
            word = re.split('\.', re.split('значения слова ', text)[1])[0]
        elif 'Определите' in first_phrase:
            text, task = task_text, first_phrase
            word = re.split('\.', re.split('значения слова ', task)[1])[0]
        return text, task, variants, word

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver3.pkl"):
        pass
    
    def save(self, path='data/models/solver3.pkl'):
        pass
    
    def predict_from_model(self, task):
        text, task, variants, word = self.process_task(task)
        result = self.compare_text_with_variants(word, text, variants)
        return result
Example No. 28
    return (text)


totalText = ''
for x in df['Body']:
    ps = PreProcessing(x)
    totalText = totalText + " " + ps

from wordcloud import WordCloud
import matplotlib.pyplot as plt
wc = WordCloud(max_font_size=60).generate(totalText)
plt.figure(figsize=(16, 12))
plt.imshow(wc, interpolation="bilinear")

import nltk
freqdist = nltk.FreqDist(token.tokenize(totalText))
freqdist
plt.figure(figsize=(16, 5))
freqdist.plot(20)

totalText = ''
for x in df['Title']:
    ps = PreProcessing(x)
    totalText = totalText + " " + ps

from wordcloud import WordCloud
import matplotlib.pyplot as plt
wc = WordCloud(max_font_size=60).generate(totalText)
plt.figure(figsize=(16, 12))
plt.imshow(wc, interpolation="bilinear")
Example No. 29
class Solver(object):
    def __init__(self, seed=42):
        self.morph = morph
        self.mystem = Mystem()
        self.tokenizer = ToktokTokenizer()
        self.w2v = Word2vecProcessor()
        self.seed = seed
        self.init_seed()
        self.synonyms = None
        self.antonyms = None
        self.phraseology = None
        self.phraseologisms = None
        self.prep_synon = None
        self.set_f = None
        self.verbs_dict = None
        self.chasti_rechi = None
        self.set_f_2 = None
        self.is_loaded = False

    def init_seed(self):
        random.seed(self.seed)

    def lemmatize(self, text):
        return [
            self.morph.parse(word)[0].normal_form
            for word in self.tokenizer.tokenize(text.strip())
        ]

    def get_word(self, text):
        try:
            return re.split("»", re.split("«", text)[1])[0]
        except IndexError:
            return ""

    def get_pos(self, text):
        lemmas = [l for l in self.lemmatize(text) if l != " "]
        if "фразеологизм" in lemmas:
            pos = "PHR"
        elif "синоним" in lemmas:
            pos = "SYN"
        elif "антоним" in lemmas:
            pos = "ANT"
        elif "антонимический" in lemmas:
            pos = "ANT"
        elif "синонимический" in lemmas:
            pos = "SYN"
        else:
            pos = "DEF"
        return pos

    def full_intersection(self, small_lst, big_lst):
        if sum([value in big_lst for value in small_lst]) == len(small_lst):
            return True
        return False

    def sent_split(self, text):
        reg = r"\(*\n*\d+\n*\)"
        return re.split(reg, text)

    def search(self, text_lemmas, lst):
        for l in lst:
            if self.full_intersection(l, text_lemmas):
                return "".join(l)
        return ""

    def fit(self, tasks):
        pass

    @singleton
    def load(self, path="data/models/solvers/solver24"):
        self.synonyms = open(
            os.path.join(path, r"synonyms.txt"), "r", encoding="utf8"
        ).readlines()
        self.synonyms = [
            re.sub("\.", "", t.lower().strip("\n")).split(" ") for t in self.synonyms
        ]
        self.synonyms = [[t for t in l if t] for l in self.synonyms]
        self.antonyms = open(
            os.path.join(path, r"antonyms.txt"), "r", encoding="utf8"
        ).readlines()
        self.antonyms = [t.strip(" \n").split(" - ") for t in self.antonyms]
        self.phraseology = open(
            os.path.join(path, r"phraseologisms.txt"), "r", encoding="utf8",
        ).readlines()
        self.phraseology = [
            [
                l
                for l in self.lemmatize(l)
                if l not in ["\n", " ", "...", "", ",", "-", ".", "?", r" (", r"/"]
            ]
            for l in self.phraseology
        ]
        self.phraseologisms = load_pickle(os.path.join(path, "phraseologisms.pckl"))
        self.prep_synon = pd.read_csv(os.path.join(path, "prep_synonyms.csv"))
        self.sber_phraseologs = pd.read_csv(
            os.path.join(path, "prep_phraseologisms.csv")
        )
        self.set_f, self.verbs_dict, self.chasti_rechi, self.set_f_2 = load_pickle(
            os.path.join(path, "solver24.pkl")
        )
        self.is_loaded = True

    def save(self, path="data/models/solvers/solver24"):
        pass

    @staticmethod
    def parse_task(task):
        regex = "(\([0-9]{1,2}\)|\s[0-9]{1,2}\)|[.!?-][0-9]{1,2}\))"
        p1 = "из предлож[а-яё]+\s+\(?[0-9]{1,2}\)?\s*[–—−-]\s*\(?[0-9]{1,2}\)?"
        p2 = "из предлож[а-яё]+\s+\(?[0-9]{1,2}\)?\s*"

        task = task["text"].lower()
        selector = None

        if re.findall(p1, task):
            q = re.findall(p1, task)[0]
            q = q.replace("(", "")
            q = q.replace(")", "")
            task = re.sub(p1, q, task)
            numbers = re.findall("[0-9]{1,2}", q)
            selector = list(range(int(numbers[0]), int(numbers[1]) + 1))
        elif re.findall(p2, task):
            q = re.findall(p2, task)[0]
            q = q.replace("(", "")
            q = q.replace(")", "")
            q = "." + q
            task = re.sub(p2, q, task)
            numbers = re.findall("[0-9]{1,2}", q)
            selector = [int(numbers[0])]

        l = re.split("[.!?…]", task)
        l = [re.split(regex, x) for x in l]
        l = sum(l, [])
        l = [x.strip() for x in l]
        l = [x for x in l if len(x) > 0]

        text = []
        i = 0
        while i < len(l):
            line = [l[i]]
            i += 1
            while (
                re.match(regex, line[0])
                and (i < len(l))
                and not (re.match(regex, l[i]))
            ):
                line += [l[i]]
                i += 1
            text.append(line)

        question = [x[0] for x in text if not re.match(regex, x[0])]
        if len(text[-1]) > 2:
            question += text[-1][2:]
            text[-1] = text[-1][:2]

        question = " ".join(question)

        text = [(x[0], " ".join(x[1:])) for x in text]
        text = [x for x in text if re.match(regex, x[0])]
        text_df = pd.DataFrame(text)
        text_df[0] = text_df[0].map(lambda x: int(x.replace("(", "").replace(")", "")))
        if selector:
            tmp = text_df[text_df[0].isin(selector)]
            if tmp.shape[0] > 0:
                text_df = tmp
            else:
                print(">>>>> SELECTOR ERROR")
        return question, list(text_df[1])

    def lemm_and_clear(self, text, morph):
        analyze = morph.analyze(text)
        lemm_text = [
            (x["analysis"][0]["lex"] if x.get("analysis") else x["text"])
            for x in analyze
        ]
        lemm_text = [
            self.verbs_dict[x] if x in self.verbs_dict else x for x in lemm_text
        ]

        analyze = list(zip(lemm_text, [x["text"] for x in analyze]))
        lemm_text = [x for x in lemm_text if not re.match("\s+", x)]

        lemm_text = [x for x in lemm_text if re.match("\w+", x)]
        return lemm_text, analyze

    @staticmethod
    def find_subarray(arr1, anal_arr2):
        arr2 = [x[0] for x in anal_arr2]
        sourse_arr2 = [x[1] for x in anal_arr2]
        for i_arr2 in range(len(arr2) - 1, -1, -1):
            positions = []
            last_positions = 0
            for j_arr1, word1 in enumerate(arr1):
                for j_arr2, word2 in enumerate(arr2[i_arr2:]):
                    if (word1 == word2) and (last_positions <= j_arr2):
                        last_positions = j_arr2
                        positions.append(j_arr2)
                        break
                if len(arr1) == len(positions):
                    return sourse_arr2[i_arr2:][positions[0] : positions[-1] + 1]

    def suggest_prediction(self, task):
        question_task, text_task = self.parse_task(task)

        question_task_re = re.sub("[^а-яё]", "", question_task.lower())

        if "фразеологизм" in question_task_re:
            lemm_text_task = [self.lemm_and_clear(x, self.mystem) for x in text_task]
            for num_source in range(0, self.phraseologisms[1].max() + 1):
                for seq, annotated_seq in lemm_text_task:
                    for i in range(0, len(seq)):
                        for j in range(1, self.phraseologisms[2].map(len).max() + 1):
                            if (i + j) <= len(seq):
                                if any(
                                    [
                                        set(seq[i : i + j]) == set_f
                                        for set_f in self.phraseologisms[
                                            self.phraseologisms[1] == num_source
                                        ][3]
                                    ]
                                ):
                                    find_elements = seq[i : i + j]
                                    return (
                                        "".join(
                                            self.find_subarray(
                                                find_elements, annotated_seq
                                            )
                                        )
                                        .lower()
                                        .replace(" ", "")
                                    )

        elif "синоним" in question_task_re:
            if type(text_task) == list:
                text_task = " ".join(text_task)
            norm_text_task = self.lemm_and_clear(text_task, self.mystem)

            if "синонимкслов" in question_task_re:
                word = re.findall(r"(?<=к слову).*", question_task)[0]
                words = re.findall("\w+", word)
                words = [x.lower() for x in words]

                set_seq = set(norm_text_task[0])

                select_syn = self.prep_synon[
                    self.prep_synon["MAIN"].isin(words)
                    & self.prep_synon["Синоним"].isin(set_seq)
                ]
                select_syn = select_syn[select_syn["MAIN"] != select_syn["Синоним"]]
                select_syn = select_syn.sort_values("number")
                synon_result = select_syn[["MAIN", "Синоним"]].to_dict("split")["data"]

                if synon_result:
                    tmp = [x for x in synon_result if x[0] == words[0]]
                    if tmp:
                        synon_result = tmp[0]
                    else:
                        synon_result = synon_result[0]

                for norm_w, real_w in norm_text_task[1]:
                    if norm_w == synon_result[1]:
                        return real_w.lower()

            elif re.match(".*синонимич.*пар.*", question_task_re) or (
                "синонимы" in question_task_re
            ):
                result = []

                set_seq = set(norm_text_task[0])
                try:
                    select_syn = self.prep_synon[
                        self.prep_synon["prep_MAIN"].isin(set_seq)
                        & self.prep_synon["prep_Синоним"].isin(set_seq)
                    ]
                    select_syn = select_syn[
                        select_syn["prep_MAIN"] != select_syn["prep_Синоним"]
                    ]
                    select_syn = select_syn.sort_values("number")
                    synon_result = set(
                        select_syn[["prep_MAIN", "prep_Синоним"]].to_dict("split")[
                            "data"
                        ][0]
                    )

                    for norm_w, real_w in norm_text_task[1]:
                        if norm_w in synon_result:
                            result.append(real_w)
                            if len(synon_result) == len(result):
                                break
                    return "".join(result).lower()

                except Exception:
                    pass

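            # Fallback: embed every distinct lemma with w2v and pick the two
            # closest words by cosine distance.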
            result = []
            set_seq = set(norm_text_task[0])
            list_seq = list(set_seq)
            list_seq_w2v = [self.w2v.word_vector(i) for i in list_seq]
            list_seq = [x[0] for x in zip(list_seq, list_seq_w2v) if x[1] is not None]
            list_seq_w2v = [x for x in list_seq_w2v if x is not None]
            tmp = cosine_distances(np.stack(list_seq_w2v))
            for i in range(tmp.shape[0]):
                tmp[i, i] += 1000
            n1, n2 = np.unravel_index(tmp.argmin(), tmp.shape)

            synon_result = set((list_seq[n1], list_seq[n2]))

            for norm_w, real_w in norm_text_task[1]:
                if norm_w in synon_result:
                    result.append(real_w)
                    if len(synon_result) == len(result):
                        break
            return "".join(result).lower()

    def predict_from_model(self, task):
        prediction = self.suggest_prediction(task)
        if not prediction:
            task_description, sentences = self.parse_task(task)
            prediction = "".join(
                random.choices(
                    [
                        w.strip(punctuation)
                        for w in self.tokenizer.tokenize(random.choice(sentences))
                        if w not in punctuation and not w.isdigit()
                    ],
                    k=2,
                )
            )
        return prediction
Exemplo n.º 30
0
class Preprocessor(PreprocessorConfig):
    """A Preprocessor object inherits from a PreprocessorConfig object to
    initialize its parameters. Then, it does 5 things :

    1. Detects and replaces numbers/float by a generic token 'FLOAT', 'INT'
    2. Add spaces in between punctuation so that tokenisation avoids adding
    'word.' to the vocabulary instead of 'word', '.'.
    3. Lowers words
    4. Recursive word phrases detection : with a simple probabilistic rule,
    gathers the tokens 'new', york' to a single token 'new_york'.
    5. Frequency Subsampling : discards unfrequent words with a probability
    depending on their frequency.

    It works with 2 main methods, '.fit' and .'transform'. The first method
    fits the vocabulary (which implies to lower, tokenize, do the word
    phrase detection and frequency subsampling). Fitting the vocabulary implies
    to calculate word frequencies over all the corpus, which can be a challenge
    when parallelizing the code.
    The 'transform' method then uses the learned vocabulary to re-write clean
    files in the 'writing_dir' directory. This method is also parallelized over
    all the cpus available.

    Usage example:
    ```python
    prep = Preprocessor('/tmp/logdir')  # We suppose we already have a
    # PreprocessorConfig saved in /tmp/logdir
    prep.fit('~/mydata/')
    prep.filter()
    prep.transform('~/mydata')
    ```
    """
    def __init__(self, log_dir, from_log=False):
        self.log_dir = log_dir
        if checkExistenceFile(os.path.join(log_dir,
                                           "PreprocessorConfig.json")):
            self.read_config()
        self.tok = ToktokTokenizer()
        self.parsing_char_ = sha1(b"sally14").hexdigest()
        self.fitted = False
        if from_log:
            self.fitted = True
            with open(
                    os.path.join(self.log_dir, "vocabulary.json"),
                    "r",
                    encoding="utf-8",
            ) as f:
                self.vocabulary_ = json.load(f)
            with open(
                    os.path.join(self.log_dir, "WordPhrases.json"),
                    "r",
                    encoding="utf-8",
            ) as f:
                p = json.load(f)
                self.phrasewords_ = {
                    i.replace("_", self.parsing_char_): p[i]
                    for i in p.keys()
                }

    def get_batches(self, filenames):
        """Defines the filename batches to multiprocess fitting and transformation
        Args:
            filenames : str or list of str
                a list of files or a directory containing the files to fit/
                transform the data on.
        Returns:
            batches : list of list of str
                the list of batches (lists of filenames)
        """
        if isinstance(filenames, str):
            if os.path.isdir(filenames):
                ls = glob(os.path.join(filenames, "*"))
            else:
                # A plain (non-directory) string is treated as a single file.
                ls = [filenames]
        elif isinstance(filenames, list):
            ls = filenames
        else:
            logger.error("Bad type for filenames, must be str or list of str")
            raise TypeError("filenames must be a str or a list of str")
        batches = []
        cpu = cpu_count()
        n = len(ls)
        if n >= cpu:
            for i in range(cpu - 1):
                batches.append(ls[(n // cpu) * i:(n // cpu) * (i + 1)])
            batches.append(ls[(n // cpu) * (cpu - 1):])
        else:
            batches = list(map(lambda x: [x], ls))
        assert len(batches) == min(cpu, n)
        return batches

    def fit_batch(self, filebatch):
        """
        Fits one batch
        Args:
            filebatch : list of str
                the list of file names in the given batch
        Returns:
            unig : dict
                fitted unigram dictionary
            big : dict
                fitted bigram dictionary
        """
        unig = {}
        big = {}
        for file in filebatch:
            text = openFile(file)
            cleaned_text = self.clean(text)
            unig = melt_vocab_dic(get_unigram_voc(cleaned_text), unig)
            big = melt_vocab_dic(
                get_bigram_voc(cleaned_text, self.parsing_char_), big)
            del text
            del cleaned_text
        return [unig, big]

    def fit(self, filenames):
        """
        Parallelizes the fitting & definition of vocabulary, dumped in
        self.log_dir
        Args:
            filenames : str or list of str
                a list of files, or a directory containing the files to fit
                the vocabulary on
        """
        logger.info("Started fitting")
        batches = self.get_batches(filenames)
        logger.info("Defined {} batches for multiprocessing".format(
            cpu_count()))
        logger.info("Starting parallelized fitting")
        pool = Pool(processes=cpu_count())
        results = pool.map(self.fit_batch, batches)
        pool.close()
        pool.terminate()
        pool.join()
        logger.info("Received {} batches results")
        logger.info("Melting unigram and bigrams dictionnaries")
        self.unigram_dic_ = {}
        self.bigram_dic_ = {}
        for j in range(len(results)):
            self.unigram_dic_ = melt_vocab_dic(self.unigram_dic_,
                                               results[j][0])
            self.bigram_dic_ = melt_vocab_dic(self.bigram_dic_, results[j][1])
            results[j] = 0  # Clears memory
        del results
        gc.collect()
        with open(os.path.join(self.log_dir, "unigrams.json"),
                  "w",
                  encoding="utf-8") as f:
            json.dump(self.unigram_dic_, f)
        with open(os.path.join(self.log_dir, "bigrams.json"),
                  "w",
                  encoding="utf-8") as f:
            json.dump(self.bigram_dic_, f)

    def filter(self):
        """Filters the results based on the configuration, saves the
        vocabulary and the word phrases"""
        logger.info("Building word phrases score")
        with open(os.path.join(self.log_dir, "unigrams.json"),
                  "r",
                  encoding="utf-8") as f:
            self.unigram_dic_ = json.load(f)
        with open(os.path.join(self.log_dir, "bigrams.json"),
                  "r",
                  encoding="utf-8") as f:
            self.bigram_dic_ = json.load(f)
        self.build_score()
        self.phrasewords_ = {}
        self.phrasewords()
        self.vocabulary_ = {}
        self.build_vocab()
        self.wordcount2freq()
        logger.info("Subsampling unfrequent words")
        self.subsample_freq_dic()
        logger.info("Corpus fitted")
        self.fitted = True
        logger.info("Saving vocabulary")
        with open(
                os.path.join(self.log_dir, "vocabulary.json"),
                "w",
                encoding="utf-8",
        ) as f:
            json.dump(self.vocabulary_, f)
        self.save_word_phrases()
        self.get_summary()

    def clean(self, text):
        """Parses a text, tokenize, lowers and replace ints and floats by a
        special token
        Args:
            text : str
                a text represented as a string
        Returns:
            words : str
                a clean text
        """
        words = self.tok.tokenize(text)
        words = " ".join(
            map(lambda x: convertFloat(convertInt(x.lower())), words))
        return words

    def build_score(self):
        """
        Adds a bigram score to the 'bigram_dic_' dictionary.
        bigram_dic_ = {bigram : occurrences} becomes:
        bigram_dic_ = {bigram : (occurrences, score)}
        """
        for bigrams in self.bigram_dic_.keys():
            i, j = bigrams.split(self.parsing_char_)
            score = (self.bigram_dic_[bigrams] - self.params["phrases_delta"]
                     ) / (self.unigram_dic_[i] * self.unigram_dic_[j])
            self.bigram_dic_[bigrams] = (self.bigram_dic_[bigrams], score)

    def build_vocab(self):
        """
        Creates a dictionary 'vocabulary_' which contains unigrams and word
        phrases, with their occurrences.
        """
        copy_dict = self.unigram_dic_.copy()
        for word in self.bigram_dic_:
            # First feed the vocabulary with bigrams :
            if word in self.phrasewords_:
                try:
                    i, j = (word.replace(self.parsing_char_, " ", 1)).split()
                    # delete unigrams if unigrams only appear in a given bigram
                    if self.unigram_dic_[i] == self.phrasewords_[word]:
                        try:
                            # Delete element from copy_dict and not
                            # unigram_dic_
                            del copy_dict[i]
                        except KeyError:
                            pass
                    if self.unigram_dic_[j] == self.phrasewords_[word]:
                        try:
                            del copy_dict[j]
                        except KeyError:
                            pass
                    self.vocabulary_[word.replace(
                        self.parsing_char_, "_")] = self.phrasewords_[word]
                except Exception:
                    pass
        self.vocabulary_ = melt_vocab_dic(copy_dict, self.vocabulary_)

    def phrasewords(self):
        """
        Creates a dictionary 'phrasewords_' which contains word
        phrases, with their occurrences.
        """
        for bigrams in self.bigram_dic_:
            if self.bigram_dic_[bigrams][1] > self.params["phrases_threshold"]:
                self.phrasewords_[bigrams] = self.bigram_dic_[bigrams][0]

    def wordcount2freq(self):
        """
        Creates the 'vocab_freq_' dictionary: goes from a vocabulary_
        dictionary with occurrences to a dictionary of the vocabulary with
        frequencies. Useful for frequency subsampling.
        """
        count = 0
        dico = self.vocabulary_
        dico2 = {}
        for i in dico:
            count = count + dico[i]
        for i in dico:
            newkey = i.replace(self.parsing_char_, "_", 1)
            dico2[newkey] = dico[i] / count
        self.vocab_freq_ = dico2

    def subsample_freq_dic(self):
        """
        Vocab dictionnary frequency subsampling.
        $$p = 1 - \sqrt{\frac{t}{f}}$$
        With $f$ the frequency of a given word, and $p$ probability
        to discard the word.
        """
        t = self.params["freq_threshold"]
        vocab = self.vocab_freq_
        # Seed once for reproducibility (random.seed hashes strings);
        # re-seeding inside the loop would make every draw identical.
        seed("sally14")
        for word in self.vocab_freq_.keys():
            try:  # In some very rare cases, doesn't work
                # Computing the discard probability (Mikolov et al. 2013)
                freq = vocab[word]
                prob = 1 - sqrt(t / freq)
                # Simulate a Bernoulli(prob) draw with a uniform [0, 1]
                x = uniform(0, 1)
                if x < prob:
                    del self.vocabulary_[word]
            except Exception:
                pass
        # Order vocab by frequency:
        self.vocabulary_ = OrderedDict(
            sorted(self.vocabulary_.items(), key=lambda x: x[1], reverse=True))
        # Cut to the maximum vocabulary size if one is set
        # (dict.keys() is not subscriptable in Python 3, hence the list())
        if self.params["vocabulary_size"] is not None:
            self.vocabulary_ = {
                k: self.vocabulary_[k]
                for k in list(self.vocabulary_)[:self.params["vocabulary_size"]]
            }

    def wordphrases(self, t):
        """
        Gathers word phrases into single tokens (joined with '_').
        Args:
            t : str
                a text to clean
        Returns:
            t : str
                the cleaned text
        """
        count = 0
        words = t.split(" ")
        new_words = []
        # First handling the case where the text is just one word :
        # cannot generate any bigram.
        if len(words) == 1:
            new_words = words
        # Then regular cases :
        else:
            j = 0
            while j < (len(words) - 1):  # = for each word in the sentence
                big = (
                    words[j],
                    words[j + 1],
                )  # getting the (j-th, j+1-th)words
                # writing the corresponding bigram :
                bigrams = self.parsing_char_.join(big)
                # If the bigram is frequent enough to be gathered :
                if bigrams in self.phrasewords_:
                    # Then add the bigram as a new word in 'new_words'
                    new_words.append("_".join(big))
                    count = count + 1  # Count the number of gathered
                    # bigrams
                    # Directly go to the j+2-th word in order to avoid
                    # repeating the j+1-th word
                    j = j + 2
                # If the bigram is not frequent enough :
                else:
                    if j == (len(words) - 2):
                        new_words.append(words[j])
                        new_words.append(words[j + 1])
                        j = j + 2
                    # Add j-th word
                    else:
                        new_words.append(words[j])
                        # Go to j+1-th word
                        j = j + 1

        return " ".join(new_words)

    def transform_batch(self, filebatch):
        """ Transforms a batch by cleaning the text, gathering word phrases,
        replacing subsampled words by UNK token.
        Args:
            filebatch : list of str
                the list of paths to the files
        """
        for file in filebatch:
            new_file = os.path.join(
                self.params["writing_dir"],
                os.path.basename(file) + "_cleaned" + ".txt",
            )

            text = openFile(file)
            cleaned_text = self.clean(text)
            del text
            # Words phrases gathering
            cleaned_text = self.wordphrases(cleaned_text)
            # Frequency subsampling
            cleaned_text = " ".join(
                map(
                    lambda x: "UNK"
                    if (x not in self.vocabulary_.keys()) else x,
                    cleaned_text.split(" "),
                ))
            with open(new_file, "w", encoding="utf-8") as f:
                f.write(cleaned_text)
            gc.collect()

    def transform(self, filenames):
        """
        Parallelizes the transformation, dumped in writing_dir
        Args:
            filenames : str or list of str
                a list of files, or a directory containing the files to
                transform
        """
        if not self.fitted:
            logger.error("No fitting, aborting")
        else:
            logger.info("Started transform")
            batches = self.get_batches(filenames)
            logger.info("Defined {} batches for multiprocessing".format(
                cpu_count()))
            logger.info("Starting parallelized transforming")
            pool = Pool(processes=cpu_count())
            pool.map(self.transform_batch, batches)
            pool.close()
            pool.terminate()
            pool.join()
            logger.info("Succesfully transformed all the files")

    def save_word_phrases(self):
        """Saves word phrases as a json file in log_dir
        """
        cleaned_phrases = {
            k.replace(self.parsing_char_, "_"): self.phrasewords_[k]
            for k in self.phrasewords_.keys()
        }
        with open(
                os.path.join(self.log_dir, "WordPhrases.json"),
                "w",
                encoding="utf-8",
        ) as f:
            json.dump(cleaned_phrases, f)

    def get_summary(self):
        """ Writes a summary of the fitting in the log_dir
        """
        with open(os.path.join(self.log_dir, "summary.txt"),
                  "w",
                  encoding="utf-8") as text:
            text.write("Attributes: \n-------------------- \n")
            text.write("len(unigram_dic_) : " + str(len(self.unigram_dic_)) +
                       "\n" + "len(bigram_dic_) : " +
                       str(len(self.bigram_dic_)) + "\n" +
                       "len(phrasewords_) : " + str(len(self.phrasewords_)) +
                       "\n" + "len(vocabulary_) : " +
                       str(len(self.vocabulary_)) + "\n \n")
            text.write("Bigram Dic extract :\n-------------------\n")
            dico = self.bigram_dic_
            head = dict([(key.replace(self.parsing_char_, "_"), dico[key])
                         for key in sorted(dico.keys())[len(dico) //
                                                        2:len(dico) // 2 + 20]
                         ])
            text.write(str(head))
            text.write("\n\nPhrasewords Dic extract :\n-------------------\n ")
            dico = self.phrasewords_
            head = dict([(key.replace(self.parsing_char_, "_"), dico[key])
                         for key in sorted(dico.keys())[len(dico) //
                                                        2:len(dico) // 2 + 20]
                         ])
            text.write(str(head))