Exemplo n.º 1
0
Arquivo: ocr.py Projeto: BorutFlis/OCR
 def __init__(self):
     """Load the pickled corpora and build every model used by this object.

     Steps, in order:
       1. Load ``train.p`` into ``self.texts`` and ``test.p`` into
          ``self.test_texts``.
       2. Tokenise and lemmatise the training texts and train a Word2Vec
          model on them (``self.modelW2V``).
       3. Build ``self.vocabulary`` (word -> column index) from the words
          listed in ``self.get_feature_dict()``.
       4. Fit a ``OneClassSVM`` on the sum representation of the first 75
          training texts (``self.model`` / ``self.representation``).
       5. Load ``doc2vec.p``, wrap it in ``rp.Doc2Vec`` and infer the
          exemplar vector from ``self.texts[1]``.

     Raises:
         SystemExit: if ``train.p`` or ``test.p`` cannot be opened or read.
         OSError: if ``doc2vec.p`` cannot be opened (not handled here).
     """
     # Defaults are assigned before anything else so that helper methods
     # called below always find these attributes present.
     self.vocabulary = {}
     self.train_set = []
     self.model = None
     self.representation = None
     self.doc2vec = None
     self.exemplar_vec = None
     self.p_value = 0.3
     self.date_threshold = datetime.datetime(2018, 1, 1)

     # NOTE(security): pickle executes arbitrary code while loading; these
     # files must only ever come from a trusted source.
     try:
         with open("train.p", "rb") as train_file:
             self.texts = pickle.load(train_file)
         with open("test.p", "rb") as test_file:
             self.test_texts = pickle.load(test_file)
     except (OSError, IOError):
         # Nothing can be built without the corpora, so stop the program.
         # SystemExit is raised directly (same effect as ``exit(...)``)
         # because the ``exit`` helper only exists when ``site`` is loaded.
         raise SystemExit("File not found: test.p and train.p")

     # Word2Vec is trained on lemmatised tokens of the training texts.
     wnl = WordNetLemmatizer()
     pre_processed = [
         [wnl.lemmatize(token) for token in utils.simple_preprocess(text)]
         for text in self.texts
     ]
     self.modelW2V = gensim.models.Word2Vec(pre_processed, min_count=5)

     self.feature_dict = self.get_feature_dict()

     # Build the vocabulary used as model features: every distinct word of
     # the two-level feature dictionary gets the next free index, in
     # first-seen order.
     self.vocabulary = {}
     for groups in self.feature_dict.values():
         for words in groups.values():
             for word in words:
                 if word not in self.vocabulary:
                     self.vocabulary[word] = len(self.vocabulary)

     sum_rep = rp.SumRepresentation(self.vocabulary, self.feature_dict)
     self.cvec = CountVectorizer(
         vocabulary=self.vocabulary, tokenizer=tk.LemmaTokenizer()
     )

     # Only the first 75 training texts are used to fit the one-class model.
     self.train_set = sum_rep.fit_transform(self.texts[:75])
     model_sum = OneClassSVM(nu=0.05)
     model_sum.fit(self.train_set)
     self.model = model_sum
     self.representation = sum_rep

     with open("doc2vec.p", "rb") as d2v_file:
         d2v_train = pickle.load(d2v_file)
     self.doc2vec = rp.Doc2Vec(d2v_train)

     # NOTE(review): the whole text self.texts[1] is passed as a single
     # token; infer_vector presumably expects a list of word tokens, so
     # this may need tokenising first - confirm before changing.
     self.exemplar_vec = self.doc2vec.model.infer_vector([self.texts[1]])