Example #1
    def run_em(self):
        sum_probs = defaultdict(lambda: 1.0)
        for i in range(10):
            print "iteration ", i
            for sentence in self.sentences:
                if(sentence.strip() == ""):
                    continue
                parsing_algo = ParsingAlgo(
                    sentence,
                    self.dep_multinomial_holder.mult_list,
                    self.stop_multinomial_holder.mult_list)
                marginals = parsing_algo.get_marginals()
                sum_probs[i] += math.log(parsing_algo.total_potentials)
                edges = parsing_algo.hypergraph.edges
                self.update_counts(marginals, edges)

            if(sum_probs[i-1]!=1.0):
                assert sum_probs[i] > sum_probs[i-1], \
                 "The prob are %r, %r"% (sum_probs[i],  sum_probs[i-1])

            self.update_parameters()
            self.validate_multinomials(self.dep_multinomial_holder)
            self.validate_multinomials(self.stop_multinomial_holder)

        pickle_hand = PickleHandler(self.final_value_path)
        pickle_hand.write_to_pickle(
            self.dep_multinomial_holder.mult_list,
            self.stop_multinomial_holder.mult_list)
        pprint.pprint(sum_probs)
Example #2
 def __init__(self,
              base_url: str,
              threads_num: Optional[int] = 999,
              target_formats: Optional[str] = None) -> None:
     self.base_url = base_url
     self.threads_num = threads_num
     self.target_formats = target_formats.split(
         ',') if target_formats else target_formats
     self.pre_download_dict = dict()
     self.history_handler = PickleHandler('wallpaper.history')
     self.history_urls = self.history_handler.load()
     self.download_folder = './4chan_thread_download_folder'
     self.lock = threading.Lock()
Example #3
 def __init__(self, corpus_path, initial_values_path,
              final_value_path,debug_mode):
     self.sentences = self.get_sentences(corpus_path)
     self.debug_mode = debug_mode
     self.final_value_path = final_value_path
     self.pickle_handler = PickleHandler(initial_values_path)
     dep_mult_list, stop_mult_list =\
         self.pickle_handler.init_all_dicts()
     self.stop_multinomial_holder = MultinomialHolder()
     self.stop_multinomial_holder.mult_list = stop_mult_list
     self.dep_multinomial_holder = MultinomialHolder()
     self.dep_multinomial_holder.mult_list = dep_mult_list
Example #4
        self.root_val_file_name = root_val_file_name
        self.dep_creator = DepCreator()
        self.stop_cont_creator = ContStopCreator()
        np.seterr(divide='ignore', invalid='ignore')

    def sentences(self):
        sentences = []
        with open(self.harmonic_file_name,"r") as fp:
            sentences += fp.readlines()
        with open(self.root_val_file_name,"r") as fp:
            sentences += fp.readlines()
        return sentences

    def initialize_harmonic_values(self):
        sentences = self.sentences()
        for sent in sentences:
            if "attach" in sent:
                self.dep_creator.add_entry(sent)
            if "continue" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "stop" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "root" in sent:
                self.dep_creator.add_entry(sent)

if __name__ == "__main__":
    initializer = HarmonicInitializer("data/harmonic", "data/root_val_file.txt")
    initializer.initialize_harmonic_values()
    pickle_handler = PickleHandler("data/harmonic_values_numpy")
    pickle_handler.write_to_pickle(
        initializer.dep_creator.prob_attach,
        initializer.stop_cont_creator.prob_cont,
        "data/harmonic_values_numpy")
Example #5
            return self.stop_cont_mult_holder[
                arc.head_word, arc.dir, arc.is_adj].prob[self.stop]

        # When the tuple does not have any values,
        # it means trap to constit
        else:
            return 1

    def display(self):
        self.get_hypergraph()
        marginals = self.get_marginals()

        for edge in self.hypergraph.edges:
            print edge.label, marginals[edge.id], self.potentials[edge]

        for node in self.hypergraph.nodes:
            print node.label

        #self.c.show()

if __name__ == "__main__":
    sentence = "NN VBZ" #NN VBZ RB VBN VBN"
    pickle_handler = PickleHandler("data/dummy")
    dep_mult, stop_cont_mult = pickle_handler.init_all_dicts()
    parsing = ParsingAlgo(sentence, dep_mult, stop_cont_mult)
    parsing.get_hypergraph()
    parsing.display()
    print sentence
    depen = parsing.best_edges()
    pprint.pprint(depen)
Example #6
import os
from pickle_handler import PickleHandler
from sklearn_wrapper_extractor import SklearnWrapperExtractor
from dataframe_manipulator import DataframeManipulator

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory,
                                 'pickled_lines_transformed_001.pkl')
    out_pickled_df = None
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)

    dfm = DataframeManipulator(pickle_handler.df)
    labels = [
        'PII|Health|condition_treatment', 'PII|Health|health_payment',
        'PII|Health|applications_and_claims',
        'PII|Employment|performance_review'
    ]
    pickle_handler.df = dfm.equalize_rows_by_label(labels)

    #Plot

    df_plot = pickle_handler.df.groupby(
        ['minimum_label'], as_index=False).count()[['minimum_label', 'tag']]
    df_plot['label'] = df_plot.apply(
        lambda x: x['minimum_label'].split('|')[2], axis=1)
    df_plot.columns = ['minimum_label', 'count', 'label']
    plt = df_plot.plot.barh(x='label', y='count')

    sklearn_wrapper_extractor = SklearnWrapperExtractor(pickle_handler.df)
    sklearn_wrapper_extractor.prepare_data()
Example #7
import pandas as pd
import os
from pickle_handler import PickleHandler

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory, 'pickled_paragraphs.pkl')
    out_pickled_df = os.path.join(in_directory,
                                  'pickled_paragraphs_transformed_001.pkl')
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    pickle_handler.perform_nlp()
    #pickle_handler.set_category_ids()
    pickle_handler.save_pickle()
    print('DONE')
Example #8
import os
from pickle_handler import PickleHandler
from sklearn_wrapper_extractor import SklearnWrapperExtractor

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory, 'df_pickle_transformed_ids_001.pkl')
    out_pickled_df = None
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)

    sklearn_wrapper_extractor = SklearnWrapperExtractor(pickle_handler.df)
    sklearn_wrapper_extractor.prepare_data()
    sklearn_wrapper_extractor.vectorize_documents_tokenized()
    #sklearn_wrapper_extractor.get_features_with_chi2()
    sklearn_wrapper_extractor.get_features_with_model()
    #sklearn_wrapper_extractor.save_features()



Example #9
# import pickle # to load pickle data
import numpy as np  # for matrix multiplication
from pickle_handler import PickleHandler
import tensorflow as tf  # main tf class for data related ops
import os  # for os related tasks

# preprocess got data using pickle handler class
gotData = PickleHandler("./got.pkl")

# length of sequence to consider for training
len_seq = 100

# total examples per epoch to target
examples_per_seq = len(gotData.text2num) // len_seq

# make a dataset from the encoded text
char_dataset = tf.data.Dataset.from_tensor_slices(gotData.text2num)

# make a sequence dataset from the text
sequence = char_dataset.batch(len_seq + 1, drop_remainder=True)


# split data set into two parts
def split_input_output(chunk):
    input_text = chunk[:-1]  # input of the text
    output_text = chunk[1:]  # output of the text

    return input_text, output_text
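
# As an aside (not part of the original script), applying split_input_output
# to a toy chunk of token ids makes the one-step shift between input and
# target visible; a plain Python list is used here instead of a tf tensor.
example_chunk = [0, 1, 2, 3, 4]
example_input, example_target = split_input_output(example_chunk)
# example_input == [0, 1, 2, 3]; example_target == [1, 2, 3, 4]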


# create a dataset mapping to split data accordingly
dataset = sequence.map(split_input_output)
Example #10
# load tensorflow for deep learning
import tensorflow as tf
import numpy as np  # for matrix multiplication
from pickle_handler import PickleHandler  # handle pickle data
from sys import argv
from tqdm import tqdm

filename, modelfile, outputfile = argv

# load gotdata
gotData = PickleHandler("./got.pkl")


def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels,
                                                           logits,
                                                           from_logits=True)


# define a new model
model = tf.keras.Sequential([
    # an embedding layer
    tf.keras.layers.Embedding(gotData.vocab_size(),
                              256,
                              batch_input_shape=[1, None]),
    # lstm layer
    tf.keras.layers.LSTM(
        512,
        stateful=True,
        return_sequences=True,
        recurrent_initializer="glorot_uniform",
Example #11
import pandas as pd
import os
from pickle_handler import PickleHandler

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory, 'df_pickle_001.pkl')
    out_pickled_df = os.path.join(in_directory,
                                  'df_pickle_transformed_001.pkl')
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    pickle_handler.perform_nlp()
    pickle_handler.set_category_ids()
    pickle_handler.save_pickle()
Example #12
class Parser:

    def __init__(self, corpus_path, initial_values_path,
                 final_value_path,debug_mode):
        self.sentences = self.get_sentences(corpus_path)
        self.debug_mode = debug_mode
        self.final_value_path = final_value_path
        self.pickle_handler = PickleHandler(initial_values_path)
        dep_mult_list, stop_mult_list =\
            self.pickle_handler.init_all_dicts()
        self.stop_multinomial_holder = MultinomialHolder()
        self.stop_multinomial_holder.mult_list = stop_mult_list
        self.dep_multinomial_holder = MultinomialHolder()
        self.dep_multinomial_holder.mult_list = dep_mult_list

    def run_em(self):
        sum_probs = defaultdict(lambda: 1.0)
        for i in range(10):
            print "iteration ", i
            for sentence in self.sentences:
                if(sentence.strip() == ""):
                    continue
                parsing_algo = ParsingAlgo(
                    sentence,
                    self.dep_multinomial_holder.mult_list,
                    self.stop_multinomial_holder.mult_list)
                marginals = parsing_algo.get_marginals()
                sum_probs[i] += math.log(parsing_algo.total_potentials)
                edges = parsing_algo.hypergraph.edges
                self.update_counts(marginals, edges)

            if(sum_probs[i-1]!=1.0):
                assert sum_probs[i] > sum_probs[i-1], \
                 "The prob are %r, %r"% (sum_probs[i],  sum_probs[i-1])

            self.update_parameters()
            self.validate_multinomials(self.dep_multinomial_holder)
            self.validate_multinomials(self.stop_multinomial_holder)

        pickle_hand = PickleHandler(self.final_value_path)
        pickle_hand.write_to_pickle(
            self.dep_multinomial_holder.mult_list,
            self.stop_multinomial_holder.mult_list)
        pprint.pprint(sum_probs)

    def update_counts(self, marginals, edges):
        for edge in edges:
            arc = edge.label
            if arc.is_cont and arc.modifier_word != "":
                self.stop_multinomial_holder.inc_counts(
                    arc.is_cont, (arc.head_word, arc.dir, arc.is_adj),
                    marginals[edge.id])
                self.dep_multinomial_holder.inc_counts(
                    arc.modifier_word, (arc.head_word, arc.dir),
                    marginals[edge.id])

            if not arc.is_cont:
                self.stop_multinomial_holder.inc_counts(
                    arc.is_cont, (arc.head_word, arc.dir, arc.is_adj),
                    marginals[edge.id])

    def update_parameters(self):
        self.dep_multinomial_holder.estimate()
        self.stop_multinomial_holder.estimate()

    def get_sentences(self, file_path):
        sentences = []
        with open(file_path,"r") as fp:
            sentences = fp.readlines()
        return sentences

    def validate_multinomials(self, multinomial_holder):
        for key, mult in multinomial_holder.mult_list.iteritems():
            if(self.debug_mode):
                print key
                pprint.pprint(mult.prob)

            total = sum(mult.prob.values())
            assert round(total, 1) == 1.0 or round(total, 1) == 0 ,\
               "The mult for " + str(key) + " is not totalling to 1 "\
               + str(total)
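
The parser examples above construct PickleHandler with a single path and rely on two methods, init_all_dicts and write_to_pickle, but the class itself is not shown anywhere on this page. A minimal sketch consistent with those calls could look like the following; the pair-of-objects file layout is an assumption for illustration, not the project's actual implementation.

import pickle


class PickleHandler:
    """Assumed wrapper around one pickle file holding a (dep, stop) pair."""

    def __init__(self, path):
        self.path = path

    def init_all_dicts(self):
        # load the (dep_mult_list, stop_mult_list) tuple written earlier
        with open(self.path, "rb") as fp:
            return pickle.load(fp)

    def write_to_pickle(self, dep_mult_list, stop_mult_list):
        # persist both multinomial tables together
        with open(self.path, "wb") as fp:
            pickle.dump((dep_mult_list, stop_mult_list), fp)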
Example #13
from pickle_handler import PickleHandler  # to handle pickle files
import tensorflow as tf  # for deep learning libs
import os  # for os related work

# load data set to work
gotData = PickleHandler("./got.pkl")

# to set seq length for the learning
len_seq = 1000

# total num of examples
examples_per_seq = len(gotData.text2num) // len_seq

# to make a char dataset
char_dataset = tf.data.Dataset.from_tensor_slices(gotData.text2num)

# design a sequence of tensor from this char data
sequences = char_dataset.batch(len_seq + 1, drop_remainder=True)


def split_input_output(chunk):
    input_text = chunk[:-1]  # input of the text
    output_text = chunk[1:]  # output of the text

    return input_text, output_text


# prepare dataset of these sequences from this split function
dataset = sequences.map(split_input_output)

# define some dataset params
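
The snippet stops after the map step; in a typical tf.data text pipeline the next lines shuffle and batch the dataset. The buffer and batch sizes below are illustrative assumptions, not values from the original project.

# Illustrative continuation (assumed values)
BUFFER_SIZE = 10000  # how many sequences to keep in the shuffle buffer
BATCH_SIZE = 64      # sequences per training batch

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)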
Example #14
    def initialize_dep(self):
        dep_mult_holder = MultinomialHolder()
        for cond_key, mult in self.harmonic_dep_mult.iteritems():
            for prob_key in mult.prob:
                dep_mult_holder.inc_counts(prob_key, cond_key, random.random())

        dep_mult_holder.estimate()
        return dep_mult_holder

    def initialize_stop_mult_cont(self):
        stop_cont_mult_holder = MultinomialHolder()
        for cond_key, mult in self.harmonic_stop_cont_mult.iteritems():
            random_value = random.random()
            stop_cont_mult_holder.inc_counts(0, cond_key, random_value)
            stop_cont_mult_holder.inc_counts(1, cond_key, 1 - random_value)

        stop_cont_mult_holder.estimate()
        return stop_cont_mult_holder

if __name__ == "__main__":
    pickle_handler = PickleHandler("data/dummy")
    dep_mult, stop_cont_mult = pickle_handler.init_all_dicts()
    random_init = RandomInitializer(dep_mult, stop_cont_mult)
    random_init.initialize_multinomials()
    pickle_handler = PickleHandler("data/random_init")
    pickle_handler.write_to_pickle(
        random_init.dep_mult_holder.mult_list,
        random_init.stop_cont_mult_holder.mult_list)
Example #15
        if(sent_acc <= 2 and len(actual_dep) >= 2):
            self.incorrect_sent.append(sentence)
            self.incorrect_dep[incorrect_dep_key] += 1

    def get_sentences(self, file_path):
        sentences = []
        with open(file_path,"r") as fp:
            sentences = fp.readlines()
        return sentences

    def write_to_file(self, file_name, data):
        with open(file_name, "wb") as fp:
            fp.writelines(("%s\n" % line for line in data))

if __name__ == "__main__":
    pickle_handler = PickleHandler("data/harmonic_final")
    dep_mult_holder, cont_stop_mult_holder = pickle_handler.init_all_dicts()

    evaluator = Evaluator(
        "data/sentences_train.txt", "data/dep_index_train.txt",
        dep_mult_holder, cont_stop_mult_holder)

    evaluator.evaluate_sentences()
    evaluator.write_to_file("incorrect_sent_rule",
                           evaluator.incorrect_sent)

    with open("incorrect_dep_dict_new", "wb") as fp:
        pickle.dump(evaluator.incorrect_dep, fp)
Example #16
import pandas as pd
import os
from pickle_handler import PickleHandler

if __name__ == '__main__':

    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory, 'df_pickle_transformed_001.pkl')
    out_pickled_df = os.path.join(in_directory,
                                  'df_pickle_transformed_ids_001.pkl')
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    pickle_handler.set_category_ids()
Example #17
from pickle_handler import PickleHandler

N_best_features = 10

ph = PickleHandler(in_pickled_df=r'E:\Corpora\PII_Directory_20190507\sklearn_features.pkl')
best_features_dict = ph.df

targets = [target for target in best_features_dict.keys()]
print(targets)

features = best_features_dict['PII|Tax|itin_tax_id'][0][-N_best_features:]

print(features)


a = 1
Example #18
            sentences += fp.readlines()
        with open(self.root_val_file_name,"r") as fp:
            sentences += fp.readlines()
        return sentences

    def create_dict(self):
        sentences = self.sentences()
        for sent in sentences:
            if "attach" in sent:
                self.dep_creator.add_entry(sent)
            if "continue" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "stop" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "root" in sent:
                self.dep_creator.add_entry(sent)

        self.dep_creator.mult_holder.estimate()
        self.stop_cont_creator.mult_holder.estimate()

if __name__ == "__main__":
    initializer = InitDict("data/harmonic", "data/root_val_file.txt")
    initializer.create_dict()
    pickle_handler = PickleHandler("data/harmonic_values_mult")
    dep_mult_list = initializer.dep_creator.mult_holder.mult_list
    stop_cont_mult_list = initializer.stop_cont_creator.mult_holder.mult_list
    pickle_handler.write_to_pickle(dep_mult_list, stop_cont_mult_list)


Example #19
class ThreadsDownloader4chan:
    HEADER = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
        "cookie":
        "__cfduid=d903e3abeaca2effe91e7b839a96be7211527491373; _ga=GA1.3.1213173136.1527491373; _ga=GA1.2.2716196.1533521826; _gid=GA1.2.2067292582.1537358233; _gid=GA1.3.2067292582.1537358233; Hm_lvt_ba7c84ce230944c13900faeba642b2b4=1537359428,1537361149,1537362700,1537363469; Hm_lpvt_ba7c84ce230944c13900faeba642b2b4=1537363858"
    }
    PROXIES = {
        "http": "socks5://127.0.0.1:10808",
        'https': 'socks5://127.0.0.1:10808'
    }

    def __init__(self,
                 base_url: str,
                 threads_num: Optional[int] = 999,
                 target_formats: Optional[str] = None) -> None:
        self.base_url = base_url
        self.threads_num = threads_num
        self.target_formats = target_formats.split(
            ',') if target_formats else target_formats
        self.pre_download_dict = dict()
        self.history_handler = PickleHandler('wallpaper.history')
        self.history_urls = self.history_handler.load()
        self.download_folder = './4chan_thread_download_folder'
        self.lock = threading.Lock()

    def set_total(self, n):
        self.counter = 0
        self.total = n

    def get_process(self):
        with self.lock:
            self.counter += 1
        return f"[{self.counter}/{self.total}]"

    @retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
    def request_get_with_retry(self, url, **args):
        default_args = {
            'headers': ThreadsDownloader4chan.HEADER,
            'proxies': ThreadsDownloader4chan.PROXIES
        }
        default_args.update(args)
        default_args = {i[0]: i[1] for i in default_args.items() if i[1]}
        return requests.get(url, **default_args)

    def history_filter(self):
        self.pre_download_list = []
        for thread_name, imgs_f_name in self.pre_download_dict.items():
            temp_imgs_f_name = []
            for img_f_name in imgs_f_name:
                if img_f_name[0] in self.history_urls:
                    continue
                elif (self.target_formats
                      and img_f_name[0].lower().split('.')[-1]
                      not in self.target_formats):
                    continue
                else:
                    temp_imgs_f_name.append(img_f_name)
                    self.history_urls.add(img_f_name[0])
            if temp_imgs_f_name:
                for temp_img_f_name in temp_imgs_f_name:
                    self.pre_download_list.append(
                        [thread_name, temp_img_f_name[0], temp_img_f_name[1]])
        self.history_handler.dump(self.history_urls)

    @property
    def url_catalog2thread(self):
        if self.base_url[-1] != '/':
            url = self.base_url + '/'
        else:
            url = self.base_url
        return url.replace('catalog', 'thread')

    def get_all_thread(self, base_url):
        r = self.request_get_with_retry(base_url)
        data = json.loads(
            re.findall(r"var catalog =(.*?);var style_group =", r.text)[0])
        threads_url = list(data['threads'].keys())
        self.threads_url = [self.url_catalog2thread + i for i in threads_url]
        self.threads_url = self.threads_url[:self.threads_num]

    def parse_thread_get_img_url(self, thread_url):
        try:
            r = self.request_get_with_retry(thread_url)
            thread_name = (re.findall(
                r'<title>(.*?)</title>',
                r.text)[0].replace('|', '').replace('?', '').replace(
                    '*', '').replace('#', '').replace('\\', '').replace(
                        '<', '').replace('>', '').replace(':', '').replace(
                            '“', '').replace('/', '').split('-')[1].strip())
            print(self.get_process(), 'Parsed thread:', thread_name)
            html = etree.HTML(r.content)
            imgs = html.xpath(".//a[@class='fileThumb']/@href")
            imgs = ['https:' + i for i in imgs]
            f_name = html.xpath(".//div[@class='fileText']/a/text()")
            imgs_f_name = list(zip(imgs, f_name))
            self.pre_download_dict.update({thread_name: imgs_f_name})
        except:
            print(traceback.format_exc())

    def run(self):
        self.get_all_thread(self.base_url)
        self.set_total(len(self.threads_url))
        with ThreadPoolExecutor(8) as executor:
            executor.map(self.parse_thread_get_img_url, self.threads_url)
        self.history_filter()
        self.set_total(len(self.pre_download_list))
        with ThreadPoolExecutor(16) as executor:
            executor.map(self.downloader, self.pre_download_list)

    def downloader(self, items):
        try:
            print(self.get_process(), 'Downloading:', items)
            thread_name, img, f_name = items
            download_folder = os.path.join(self.download_folder, thread_name)
            with self.lock:
                if not os.path.exists(download_folder):
                    os.makedirs(download_folder)
            content = self.request_get_with_retry(img).content
            with open(os.path.join(download_folder, f_name), 'wb') as f:
                f.write(content)
        except:
            print(traceback.format_exc())
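
ThreadsDownloader4chan uses a different PickleHandler than the parser examples: it is constructed with 'wallpaper.history' and exposes load and dump for a set of already-downloaded URLs. A minimal sketch matching that usage is shown below; the empty-set default on the first run is an assumption, not the project's confirmed behavior.

import os
import pickle


class PickleHandler:
    """Assumed download-history store: one pickled set of URLs."""

    def __init__(self, path):
        self.path = path

    def load(self):
        # return an empty set on the first run, otherwise the saved history
        if not os.path.exists(self.path):
            return set()
        with open(self.path, "rb") as fp:
            return pickle.load(fp)

    def dump(self, data):
        with open(self.path, "wb") as fp:
            pickle.dump(data, fp)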
Example #20
# load tensorflow for deep learning
import tensorflow as tf
import numpy as np  # for matrix multiplication
from pickle_handler import PickleHandler  # handle pickle data
from sys import argv
from tqdm import tqdm
filename, modelfile, outputfile = argv


# load gotdata
gotData = PickleHandler("./got.pkl")


def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True
    )


# define a new model
model = tf.keras.Sequential(
    [
        # an embedding layer
        tf.keras.layers.Embedding(
            gotData.vocab_size(), 256, batch_input_shape=[1, None]
        ),
        # lstm layer
        # bidirectional rnn layer
        tf.keras.layers.Bidirectional(
            tf.keras.layers.SimpleRNN(
                1024,
Example #21
from pickle_handler import PickleHandler

N_best_features = 10

ph = PickleHandler(
    in_pickled_df=r'E:\Corpora\PII_Directory_20190507\features_20190528.pkl')
best_features_dict = ph.df

a = 1