# Elasticsearch helper functions (get_documents, field_exists,
# multifield_term_search, update_document) and es_server are assumed to be
# imported from elsewhere in the project; standard-library and sklearn
# imports used below:
import linecache
import operator
from os import chdir, chmod, environ, makedirs, walk
from os.path import exists, isdir, isfile
from subprocess import PIPE, Popen
from sys import exc_info
from zipfile import ZipFile

from sklearn.feature_extraction.text import CountVectorizer


def get_bag_of_words(urls):
    docs = get_documents(urls)
    bag_of_words = {}
    for url in docs.keys():
        bof = process_text(docs[url])
        bof = valid_words(bof)
        bag_of_words[url] = bof
    return bag_of_words
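
# Usage sketch (illustrative, not from the original source): assuming
# get_documents returns {url: raw_text} and valid_words keeps only known
# vocabulary terms, the result maps each url to its filtered token list:
#
#   bag = get_bag_of_words(["http://example.com/a"])
#   # bag -> {"http://example.com/a": ["ebola", "virus", ...]}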

def getBackwardLinks(self, urls, session):
    es_info = self.esInfo(session['domainId'])

    # Skip urls whose backward links have already been crawled
    results = field_exists("crawled_backward", [es_info['mapping']['url']], self._all,
                           es_info['activeCrawlerIndex'], es_info['docType'], self._es)
    already_crawled = [result[es_info["mapping"]["url"]][0] for result in results]
    not_crawled = list(set(urls).difference(already_crawled))

    results = get_documents(not_crawled, es_info["mapping"]['url'], [es_info["mapping"]['url']],
                            es_info['activeCrawlerIndex'], es_info['docType'], self._es)
    not_crawled_urls = [results[url][0][es_info["mapping"]["url"]][0] for url in not_crawled]

    # Launch the seeds generator to backward-crawl the remaining urls
    chdir(environ['DDT_HOME'] + '/seeds_generator')
    comm = "java -cp target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar StartCrawl -c backward" \
           " -u \"" + ",".join(not_crawled_urls) + "\"" + \
           " -t " + session["pagesCap"] + \
           " -i " + es_info['activeCrawlerIndex'] + \
           " -d " + es_info['docType'] + \
           " -s " + es_server
    p = Popen(comm, shell=True, stderr=PIPE)
    output, errors = p.communicate()
    print output
    print errors
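
# For reference, the generated command has this shape (illustrative values
# for pagesCap, index, docType, and es_server; not from the original source):
#
#   java -cp target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar StartCrawl \
#        -c backward -u "http://a.com,http://b.com" -t 100 -i my_index -d page -s <es_server>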

def get_bigrams_trigrams(text=[], termCount=20, w2v=None, es=None):
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
    bigram_analyze = bigram_vectorizer.build_analyzer()
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
    trigram_analyze = trigram_vectorizer.build_analyzer()

    bi_results = map(lambda t: bigram_analyze(t), text)
    tri_results = map(lambda t: trigram_analyze(t), text)

    bigrams = []
    bi_dict_corpus = {}
    for doc in bi_results:
        # Count bigram occurrences within this document
        bi_dict = {}
        for bi in doc:
            bi = bi.replace(' ', '_')
            if bi in bi_dict:
                bi_dict[bi] = bi_dict[bi] + 1
            else:
                bi_dict[bi] = 1
        if bi_dict:
            # Yamuna: Removing for now as it is slow
            #phrases = remove_stopword_phrases(bi_dict.keys())
            phrases = bi_dict.keys()
            if w2v.word_vec is None:
                # No word2vec model loaded: keep only phrases indexed in Elasticsearch
                results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es)
                phrases = [res.lower() for res in results.keys()]
            else:
                # Keep only phrases that have a word2vec vector
                phrases = [term for term in phrases if not w2v.get(term) is None]
            bi_dict_subset = {phrase: bi_dict[phrase] for phrase in phrases}
            if bi_dict_subset:
                bigrams.append(bi_dict_subset)
                # Accumulate corpus-wide counts
                for phrase in bi_dict_subset.keys():
                    if phrase in bi_dict_corpus:
                        bi_dict_corpus[phrase] = bi_dict_corpus[phrase] + bi_dict_subset[phrase]
                    else:
                        bi_dict_corpus[phrase] = bi_dict_subset[phrase]

    trigrams = []
    tri_dict_corpus = {}
    for doc in tri_results:
        # Count trigram occurrences within this document
        tri_dict = {}
        for tri in doc:
            tri = tri.replace(' ', '_')
            if tri in tri_dict:
                tri_dict[tri] = tri_dict[tri] + 1
            else:
                tri_dict[tri] = 1
        if tri_dict:
            # Yamuna: Removing for now as it is slow
            #phrases = remove_stopword_phrases(tri_dict.keys())
            phrases = tri_dict.keys()
            if w2v.word_vec is None:
                results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es)
                phrases = [res for res in results.keys()]
            else:
                phrases = [term for term in phrases if not w2v.get(term) is None]
            tri_dict_subset = {phrase: tri_dict[phrase] for phrase in phrases}
            if tri_dict_subset:
                trigrams.append(tri_dict_subset)
                for phrase in tri_dict_subset.keys():
                    if phrase in tri_dict_corpus:
                        tri_dict_corpus[phrase] = tri_dict_corpus[phrase] + tri_dict_subset[phrase]
                    else:
                        tri_dict_corpus[phrase] = tri_dict_subset[phrase]

    # Return per-document counts plus the termCount most frequent phrases corpus-wide
    return bigrams, trigrams, \
        sorted(bi_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount], \
        sorted(tri_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount]
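
# Illustrative sketch (not part of the original module): how the
# CountVectorizer analyzer used above splits a string into n-grams.
# It runs standalone; no Elasticsearch or word2vec model is needed:
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   bigram_analyze = CountVectorizer(ngram_range=(2, 2)).build_analyzer()
#   print bigram_analyze("ebola virus outbreak in west africa")
#   # [u'ebola virus', u'virus outbreak', u'outbreak in', u'in west', u'west africa']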

def setPagesTag(self, pages, tag, applyTagFlag, session):
    es_info = self.esInfo(session['domainId'])

    entries = {}
    results = get_documents(pages, 'url', [es_info['mapping']['tag']],
                            es_info['activeCrawlerIndex'], es_info['docType'], self._es)

    if applyTagFlag and len(results) > 0:
        print '\n\napplied tag ' + tag + ' to pages ' + str(pages) + '\n\n'
        for page in pages:
            if not results.get(page) is None:
                # Pages to be tagged exist
                records = results[page]
                for record in records:
                    entry = {}
                    if record.get(es_info['mapping']['tag']) is None:
                        # There are no previous tags
                        entry[es_info['mapping']['tag']] = tag
                    else:
                        current_tag = record[es_info['mapping']['tag']][0]
                        tags = []
                        if current_tag != '':  # an empty string means all previous tags were removed
                            tags = list(set(current_tag.split(';')))
                        if len(tags) != 0:
                            # Previous tags exist: append the new tag unless already present
                            if not tag in tags:
                                entry[es_info['mapping']['tag']] = ';'.join(tags) + ';' + tag
                        else:
                            # Add the new tag
                            entry[es_info['mapping']['tag']] = tag
                    if entry:
                        entries[record['id']] = entry
    elif len(results) > 0:
        print '\n\nremoved tag ' + tag + ' from pages ' + str(pages) + '\n\n'
        for page in pages:
            if not results.get(page) is None:
                records = results[page]
                for record in records:
                    entry = {}
                    if not record.get(es_info['mapping']['tag']) is None:
                        current_tag = record[es_info['mapping']['tag']][0]
                        if tag in current_tag:
                            tags = list(set(current_tag.split(';')))
                            tags.remove(tag)
                            entry[es_info['mapping']['tag']] = ';'.join(tags)
                            entries[record['id']] = entry

    if entries:
        # Retry the bulk update a few times in case Elasticsearch fails transiently
        update_try = 0
        while update_try < 10:
            try:
                update_document(entries, es_info['activeCrawlerIndex'], es_info['docType'], self._es)
                break
            except:
                update_try = update_try + 1
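
# Tags are stored as a single ';'-delimited string per page. For example,
# applying "Relevant" to a page tagged "Neutral" yields "Neutral;Relevant",
# and removing "Neutral" from that page then yields "Relevant".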

def createModel(self, session):
    es_info = self.esInfo(session['domainId'])

    data_dir = environ["DDT_HOME"] + "/data/"
    data_crawler = data_dir + es_info['activeCrawlerIndex']
    data_training = data_crawler + "/training_data/"
    data_negative = data_crawler + "/training_data/negative/"
    data_positive = data_crawler + "/training_data/positive/"

    if not isdir(data_positive):
        makedirs(data_positive)
    if not isdir(data_negative):
        makedirs(data_negative)

    # Pages tagged Relevant are positive examples; pages tagged Irrelevant
    # are negative examples
    s_fields = {}
    query = {
        "wildcard": {es_info['mapping']["tag"]: "*Relevant*"}
    }
    s_fields["queries"] = [query]
    pos_urls = [field['url'][0] for field in multifield_term_search(s_fields, self._all, ["url", es_info['mapping']['tag']],
                                                                    es_info['activeCrawlerIndex'], es_info['docType'], self._es)
                if "irrelevant" not in field["tag"][0].lower()]

    query = {
        "wildcard": {es_info['mapping']["tag"]: "*Irrelevant*"}
    }
    s_fields["queries"] = [query]
    neg_urls = [field['url'][0] for field in multifield_term_search(s_fields, self._all, ["url", es_info['mapping']['tag']],
                                                                    es_info['activeCrawlerIndex'], es_info['docType'], self._es)]

    pos_html = get_documents(pos_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'])
    neg_html = get_documents(neg_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'])

    # Write positive pages to disk and record their urls as crawl seeds
    seeds_file = data_crawler + "/seeds.txt"
    print "Seeds path ", seeds_file
    with open(seeds_file, 'w') as s:
        for url in pos_html:
            try:
                file_positive = data_positive + self.encode(url.encode('utf8'))
                print file_positive
                s.write(url.encode('utf8') + '\n')
                with open(file_positive, 'w') as f:
                    f.write(pos_html[url][0][es_info['mapping']['html']][0])
            except IOError:
                _, exc_obj, tb = exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                filename = f.f_code.co_filename
                linecache.checkcache(filename)
                line = linecache.getline(filename, lineno, f.f_globals)
                print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

    for url in neg_html:
        try:
            file_negative = data_negative + self.encode(url.encode('utf8'))
            with open(file_negative, 'w') as f:
                f.write(neg_html[url][0][es_info['mapping']['html']][0])
        except IOError:
            _, exc_obj, tb = exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            linecache.checkcache(filename)
            line = linecache.getline(filename, lineno, f.f_globals)
            print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

    models_dir = environ["DDT_HOME"] + "/vis/html/models/"
    crawlermodel_dir = models_dir + es_info['activeCrawlerIndex']
    if not isdir(models_dir):
        makedirs(models_dir)
    if not isdir(crawlermodel_dir):
        makedirs(crawlermodel_dir)

    # Build the ACHE page-classifier model from the labeled training data
    ache_home = environ['ACHE_HOME']
    comm = ache_home + "/bin/ache buildModel -t " + data_training + " -o " + crawlermodel_dir + \
           " -c " + ache_home + "/config/stoplist.txt"
    p = Popen(comm, shell=True, stderr=PIPE)
    output, errors = p.communicate()
    print output
    print errors

    # Package the model, training data, and seeds for download
    zip_filename = models_dir + es_info['activeCrawlerIndex'] + "_model.zip"
    with ZipFile(zip_filename, "w") as modelzip:
        if isfile(crawlermodel_dir + "/pageclassifier.features"):
            print "zipping file: " + crawlermodel_dir + "/pageclassifier.features"
            modelzip.write(crawlermodel_dir + "/pageclassifier.features", "pageclassifier.features")

        if isfile(crawlermodel_dir + "/pageclassifier.model"):
            print "zipping file: " + crawlermodel_dir + "/pageclassifier.model"
            modelzip.write(crawlermodel_dir + "/pageclassifier.model", "pageclassifier.model")

        if exists(data_crawler + "/training_data/positive"):
            print "zipping file: " + data_crawler + "/training_data/positive"
            for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/positive"):
                for html_file in filenames:
                    modelzip.write(dirpath + "/" + html_file, "training_data/positive/" + html_file)

        if exists(data_crawler + "/training_data/negative"):
            print "zipping file: " + data_crawler + "/training_data/negative"
            for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/negative"):
                for html_file in filenames:
                    modelzip.write(dirpath + "/" + html_file, "training_data/negative/" + html_file)

        if isfile(data_crawler + "/seeds.txt"):
            print "zipping file: " + data_crawler + "/seeds.txt"
            modelzip.write(data_crawler + "/seeds.txt", es_info['activeCrawlerIndex'] + "_seeds.txt")

    chmod(zip_filename, 0o777)

    return "models/" + es_info['activeCrawlerIndex'] + "_model.zip"