def __init__(self, dataset, mode='by_sent'):
    '''dataset is a sequence list.'''
    self.feature_dict = LabelDictionary()
    self.add_features = False
    self.dataset = dataset
    self.feature_templates = {}
    self.reading_mode = mode
    # Caches to speed up repeated feature lookups
    self.node_feature_cache = {}
    self.initial_state_feature_cache = {}
    self.final_state_feature_cache = {}
    self.edge_feature_cache = {}
    self.wordcluster_source = os.path.join(
        path_wordcluster, 'output_liang_' + mode + '_5000')
    self.word_clusters = {}
    # Word dictionary built from the training data
    self.word_reference = uploadObject(
        os.path.join(path_utils, 'word_dict_filtered'))
    self.word_reference[BR] = 1
    # Trigger-word and trigger-POS lexicons, filled in during training
    self.inner_trigger_words = {'I': {}}
    self.outer_trigger_words = {}
    self.outer_trigger_pos = {}
    self.inner_trigger_pos = {}
    # Relative-frequency thresholds for filtering rare triggers;
    # sentence-level reading uses a lower word threshold
    if mode == 'by_sent':
        self.FILTER_WORDS = 0.001
        self.FILTER_POS = 0.001
    else:
        self.FILTER_WORDS = 0.01
        self.FILTER_POS = 0.001
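# A minimal usage sketch, assuming this __init__ belongs to a feature-extractor
# class (called FeatureExtractor below purely for illustration; the real class
# name is not shown in this snippet) and that `train_seqs` is a sequence list
# built from the training corpus:
#
#   extractor = FeatureExtractor(train_seqs, mode='by_sent')
#   extractor.add_features = True   # grow feature_dict while reading the data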
def __init__(self, dataset, mode='by_sent'):
    '''dataset is a sequence list.'''
    self.feature_dict = LabelDictionary()
    self.add_features = False
    self.dataset = dataset
    self.feature_templates = {}
    # Caches to speed up repeated feature lookups
    self.node_feature_cache = {}
    self.initial_state_feature_cache = {}
    self.final_state_feature_cache = {}
    self.edge_feature_cache = {}
    self.wordcluster_source = os.path.join(
        path_wordcluster, 'output_liang_' + mode + '_5000')
    self.word_clusters = {}
    # Word dictionary built from the training data
    self.word_reference = uploadObject(
        os.path.join(path_utils, 'word_dict_filtered'))
    self.word_reference[BR] = 1
    # This variant filters with stemmed stopwords and keeps trigger words in a set
    self.stopwords = getStopwords(stem=True)
    self.inner_trigger_words = set()
def readDoc(doc_file):
    '''Read a tokenized document: one sentence per line, tokens separated by
    single spaces. <br> markers are stripped out.'''
    res = []
    try:
        for line in open(doc_file):
            line = line.strip('\n').replace('<br>', '')
            if line != '':
                res.append(line.split(' '))
    except IOError:  # narrowed from a bare except: skip unreadable files
        pass
    return res


if __name__ == '__main__':
    model = uploadObject('sp_5_by_sent')
    ini = date(2014, 6, 1)
    data = core.find(
        {'date': {'$gte': datetime.combine(ini, datetime.min.time())}},
        timeout=False).batch_size(1000)
    count = 0
    for dat in data:
        doc_id = dat['_id']  # renamed from `id` to avoid shadowing the builtin
        job = tokenized.find_one({'$and': [
            {'_id.description': doc_id['description']},
            {'_id.details': doc_id['details']},
        ]})
        temp = job.get("carreras", [])
        if temp != []:
            print("YALA")  # document already tagged; skip it
            continue
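# A sketch of what readDoc (defined above) returns, assuming a file
# 'sample.txt' containing pre-tokenized text (the file name and contents
# here are illustrative only, not from the original source):
#
#   sents = readDoc('sample.txt')
#   # e.g. [['ingeniero', 'de', 'sistemas'], ['experiencia', '2', 'anios']]
#   # -> a list of sentences, each a list of whitespace-separated tokens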
data_dir = os.path.join(clustering_project_dir, 'enero_ing_stem')
docs_dir = os.path.join(clustering_dir, 'jobs_enero/docs')
ing_dir = os.path.join(clustering_project_dir, 'jobs_enero/complete_docs')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

#########################################################################
client = MongoClient()
db = client.JobDB
db_prueba = db.prueba
#########################################################################
word_dict_filtered = uploadObject(os.path.join(utils_path, 'word_dict_filtered'))
temp = json.loads(open('hierarchy/ident_names.json', 'r').read())
hierarchy = json.loads(open('hierarchy/carreras.json', 'r').read())
# Invert the id -> name mapping so files can be looked up by career name
fileByName = {}
for k, v in temp.items():
    fileByName[v] = k
#########################################################################

if __name__ == '__main__':
    files_ingenieria = []
    for level1 in hierarchy["children"]:
#########################################################################
client = MongoClient()
db = client.JobDB
db_prueba = db.prueba
#########################################################################
temp = json.loads(open('hierarchy/ident_names.json', 'r').read())
hierarchy = json.loads(open('hierarchy/carreras.json', 'r').read())
# Invert the id -> name mapping so files can be looked up by career name
fileByName = {}
for k, v in temp.items():
    fileByName[v] = k
#########################################################################
#########################################################################
print("Loading models...")
fun_model = uploadObject(os.path.join(funmodel_path, 'prod_model'))
req_model = uploadObject(os.path.join(reqmodel_path, 'sp_5_by_sent'))
carr_model = uploadObject(os.path.join(carrmodel_path, 'prod_model'))
jobarea_model = uploadObject(os.path.join(jobarea_path, 'sp_5_by_doc'))


def parallel_funct(docname):
    print(docname)
    doc = readDoc(os.path.join(source_dir, docname))
    sequence = makeSequence_doc(doc)
    # Decode with each model and write the predicted tag sequence back
    # onto the underlying sequence object
    fun_pred, _ = fun_model.viterbi_decode_bigram(sequence)
    fun_pred.sequence_list.seq_list[0].y = fun_pred.y
    jobarea_pred, _ = jobarea_model.viterbi_decode_bigram(sequence)
    jobarea_pred.sequence_list.seq_list[0].y = jobarea_pred.y
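# A sketch of how parallel_funct might be driven over a directory of
# documents, assuming `source_dir` holds one tokenized document per file.
# The use of multiprocessing and the pool size of 4 are illustrative
# choices, not taken from the original source:
#
#   from multiprocessing import Pool
#   docnames = os.listdir(source_dir)
#   with Pool(4) as pool:
#       pool.map(parallel_funct, docnames)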
def __init__(self, data, batch_size=1000, max_vocab_size=None, lower=False):
    self.batch_size = batch_size
    self.dataset = data
    self.lower = lower  # whether to lowercase everything
    self.max_vocab_size = max_vocab_size

    # mapping from cluster IDs to cluster IDs, to keep track of the hierarchy
    self.cluster_parents = {}
    self.cluster_counter = 0

    # the list of words in the vocabulary and their counts
    self.counts = defaultdict(int)
    self.trans = defaultdict(make_int_defaultdict)
    self.num_tokens = 0

    # the graph weights (w) and the effects of merging nodes (L)
    # (see Liang's thesis)
    self.w = defaultdict(make_float_defaultdict)
    self.L = defaultdict(make_float_defaultdict)

    # the 0/1 bit to add when walking up the hierarchy
    # from a word to the top-level cluster
    self.cluster_bits = {}

    # load filtered lexicon
    self.word_reference = uploadObject(
        os.path.join(path_utils, 'word_dict_filtered'))  # word dict from all data (200k)
    self.stem_reference = uploadObject(os.path.join(path_utils, 'stem_dict'))

    # find the most frequent words
    self.vocab = {}
    self.reverse_vocab = []
    self.create_vocab()

    # create sets of documents that each word appears in
    self.create_index()

    # make a copy of the list of words, as a queue for making new clusters
    word_queue = list(range(len(self.vocab)))

    # score potential clusters, starting with the most frequent words.
    # also, remove the batch from the queue
    self.current_batch = word_queue[:(self.batch_size + 1)]
    word_queue = word_queue[(self.batch_size + 1):]
    self.initialize_tables()

    while len(self.current_batch) > 1:
        # find the best pair of words/clusters to merge
        c1, c2 = self.find_best()

        # merge the clusters in the index
        self.merge(c1, c2)

        if word_queue:
            new_word = word_queue.pop(0)
            self.add_to_batch(new_word)

        logging.info('{} AND {} WERE MERGED INTO {}. {} REMAIN.'.format(
            self.reverse_vocab[c1] if c1 < len(self.reverse_vocab) else c1,
            self.reverse_vocab[c2] if c2 < len(self.reverse_vocab) else c2,
            self.cluster_counter,
            len(self.current_batch) + len(word_queue) - 1))

        self.cluster_counter += 1
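# A minimal usage sketch, assuming this __init__ belongs to a Brown-style
# word-clustering class (called BrownClusterer below for illustration) and
# that `corpus` is whatever tokenized structure create_vocab() expects.
# Clustering runs eagerly inside __init__, so constructing the object is
# enough to build the full merge hierarchy:
#
#   clusterer = BrownClusterer(corpus, batch_size=1000, lower=True)
#   # afterwards, cluster_parents and cluster_bits encode each word's path
#   # through the binary merge tree (its Brown-cluster bit string)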