def submit_query_terms(self, term_list, max_url_count=15, parallel_cb=None, cached=True):
    # Perform queries to Search Engine APIs.
    # This function only operates when there is no information associated with the terms,
    # usually before running extract_terms().
    #
    # Args:
    #   term_list: list of search terms that are submitted by the user
    # Returns:
    #   urls: list of urls that are returned by the Search Engine
    chdir(self.memex_home + '/seed_crawler/seeds_generator')

    query = ' '.join(term_list)
    with open('conf/queries.txt', 'w') as f:
        f.write(query)

    if not cached:
        comm = "java -cp .:class:libs/commons-codec-1.9.jar BingSearch -t " + str(max_url_count)
        p = Popen(comm, shell=True, stdout=PIPE)
        output, errors = p.communicate()
        print output
        print errors

        call(["rm", "-rf", "html"])
        call(["mkdir", "-p", "html"])
        call(["rm", "-rf", "thumbnails"])
        call(["mkdir", "-p", "thumbnails"])

        #if sys.platform in ['darwin', 'linux2']:
        if sys.platform in ['darwin']:
            download("results.txt")
        else:
            download("results.txt", True, parallel_cb)

        if exists(self.memex_home + "/seed_crawler/ranking/exclude.txt"):
            call(["rm", self.memex_home + "/seed_crawler/ranking/exclude.txt"])

        with open("results.txt", 'r') as f:
            urls = [self.validate_url(line.strip()) for line in f.readlines()]
    else:
        urls = term_search('query', term_list)

    # with open("results.txt", 'w') as f:
    #     for url in urls:
    #         f.write(url + "\n")

    for url in urls:
        self.urls_set.add(url)

    return urls  # Results from Search Engine
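# A self-contained sketch of the tail of submit_query_terms above: read candidate
# URLs from results.txt, keep only well-formed http(s) URLs (a simple stand-in for
# self.validate_url, whose actual rules live elsewhere in this class), and dedupe
# them into a set the way urls_set is used. The file name matches the one produced above.
from urlparse import urlparse  # Python 2, matching the surrounding code

def collect_result_urls_sketch(results_path="results.txt"):
    urls_set = set()
    with open(results_path, 'r') as f:
        for line in f:
            url = line.strip()
            parsed = urlparse(url)
            # Keep only absolute http/https URLs; anything else is discarded.
            if parsed.scheme in ('http', 'https') and parsed.netloc:
                urls_set.add(url)
    return urls_set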
def delDomain(self, domains):
    # Delete each domain's Elasticsearch index, its entries in the ddt_terms index,
    # and its on-disk data directory, then remove the domain entries from the
    # config index and refresh the crawler model's list of domains.
    for index in domains.values():
        delete_index(index, self._es)
        ddt_terms_keys = [doc["id"] for doc in term_search("index", [index], 0, self._all, ["term"],
                                                           "ddt_terms", "terms", self._es)["results"]]
        delete_document(ddt_terms_keys, "ddt_terms", "terms", self._es)
        data_dir = self._path + "/data/"
        data_domain = data_dir + index
        if isdir(data_domain):
            shutil.rmtree(data_domain)
    delete_document(domains.keys(), "config", "domains", self._es)
    self._crawlerModel.updateDomains()
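# A standalone sketch of the on-disk cleanup step in delDomain: given the same
# mapping of domain ids to index names, remove each index's data directory if it
# exists. The base_path default is a placeholder for self._path.
import shutil
from os.path import isdir

def remove_domain_data_sketch(domains, base_path="/path/to/ddt"):
    for index in domains.values():
        data_domain = base_path + "/data/" + index
        if isdir(data_domain):
            # Domains with no data on disk are simply skipped.
            shutil.rmtree(data_domain)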
def getTermsSummarySeedCrawler(self, opt_maxNumberOfTerms=40, session=None):
    # Summarize terms for the seed crawler: rank single terms, bigrams, and trigrams
    # over the tagged pages (or over the pages matching the session filter), then
    # attach positive/negative frequencies and any user-assigned tags to each term.
    es_info = self.esInfo(session['domainId'])

    format = '%m/%d/%Y %H:%M %Z'
    if session['fromDate'] is not None:
        session['fromDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['fromDate'], format)) * 1000)
    if session['toDate'] is not None:
        session['toDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['toDate'], format)) * 1000)

    s_fields = {
        "tag": "Positive",
        "index": es_info['activeCrawlerIndex'],
        "doc_type": es_info['docType']
    }
    pos_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]

    s_fields["tag"] = "Negative"
    neg_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]

    results = term_search(es_info['mapping']['tag'], ['Relevant'], self._pagesCapTerms,
                          ['url', es_info['mapping']['text']], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
    pos_urls = [field["id"] for field in results]

    top_terms = []
    top_bigrams = []
    top_trigrams = []

    if session['filter'] is None:
        urls = []
        if len(pos_urls) > 0:
            # If positive urls are available, search for more documents like them
            results_more_like_pos = get_more_like_this(pos_urls, ['url', es_info['mapping']["text"]], self._pagesCapTerms,
                                                       es_info['activeCrawlerIndex'], es_info['docType'], self._es)
            results.extend(results_more_like_pos)
            urls = pos_urls[0:self._pagesCapTerms] + [field['id'] for field in results_more_like_pos]

        if not urls:
            # If positive urls are not available, get the most recent documents
            results = get_most_recent_documents(self._pagesCapTerms, es_info['mapping'], ['url', es_info['mapping']["text"]],
                                                session['filter'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
            urls = [field['id'] for field in results]

        if len(results) > 0:
            text = [field[es_info['mapping']["text"]][0] for field in results]

            if len(urls) > 0:
                tfidf_all = tfidf.tfidf(urls, pos_tags=self.pos_tags, mapping=es_info['mapping'],
                                        es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
                if pos_terms:
                    extract_terms_all = extract_terms.extract_terms(tfidf_all)
                    [ranked_terms, scores] = extract_terms_all.results(pos_terms)
                    top_terms = [term for term in ranked_terms if term not in neg_terms]
                    top_terms = top_terms[0:opt_maxNumberOfTerms]
                else:
                    top_terms = tfidf_all.getTopTerms(opt_maxNumberOfTerms)

            if len(text) > 0:
                [_, _, _, _, _, _, _, _, top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(
                    text, urls, opt_maxNumberOfTerms + len(neg_terms), self.w2v, self._es)
                top_bigrams = [term for term in top_bigrams if term not in neg_terms]
                top_trigrams = [term for term in top_trigrams if term not in neg_terms]
    else:
        s_fields = {
            es_info['mapping']["text"]: "(" + session['filter'].replace('"', '\"') + ")"
        }
        if session['fromDate'] is not None:
            s_fields[es_info['mapping']["timestamp"]] = "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]"

        results = multifield_query_search(s_fields, self._pagesCapTerms, ["url", es_info['mapping']["text"]],
                                          es_info['activeCrawlerIndex'], es_info['docType'], self._es)
        ids = [field['id'] for field in results]
        text = [field[es_info['mapping']["text"]][0] for field in results]
        urls = [field[es_info['mapping']["url"]][0] for field in results]
        top_terms = get_significant_terms(ids, opt_maxNumberOfTerms, mapping=es_info['mapping'],
                                          es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
        if len(text) > 0:
            [_, _, _, _, _, _, _, _, top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(
                text, urls, opt_maxNumberOfTerms + len(neg_terms), self.w2v, self._es)
            top_bigrams = [term for term in top_bigrams if term not in neg_terms]
            top_trigrams = [term for term in top_trigrams if term not in neg_terms]

    s_fields = {
        "tag": "Custom",
        "index": es_info['activeCrawlerIndex'],
        "doc_type": es_info['docType']
    }
    custom_terms = [field['term'][0] for field in multifield_query_search(s_fields, 500, ['term'], self._termsIndex, 'terms', self._es)]
    top_terms = custom_terms + top_terms

    if not top_terms:
        return []

    # Relative frequency of each top term in the pages tagged Relevant.
    pos_freq = {}
    if len(pos_urls) > 1:
        tfidf_pos = tfidf.tfidf(pos_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'],
                                es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
        [_, corpus, ttfs_pos] = tfidf_pos.getTfArray()
        total_pos_tf = np.sum(ttfs_pos, axis=0)
        total_pos = np.sum(total_pos_tf)
        pos_freq = {}
        for key in top_terms:
            try:
                pos_freq[key] = float(total_pos_tf[corpus.index(key)]) / total_pos
            except ValueError:
                pos_freq[key] = 0
    else:
        pos_freq = {key: 0 for key in top_terms}

    # Relative frequency of each top term in the pages tagged Irrelevant.
    neg_urls = [field['id'] for field in term_search(es_info['mapping']['tag'], ['Irrelevant'], self._pagesCapTerms,
                                                     ['url'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)]
    neg_freq = {}
    if len(neg_urls) > 1:
        tfidf_neg = tfidf.tfidf(neg_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'],
                                es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
        [_, corpus, ttfs_neg] = tfidf_neg.getTfArray()
        total_neg_tf = np.sum(ttfs_neg, axis=0)
        total_neg = np.sum(total_neg_tf)
        neg_freq = {}
        for key in top_terms:
            try:
                neg_freq[key] = float(total_neg_tf[corpus.index(key)]) / total_neg
            except ValueError:
                neg_freq[key] = 0
    else:
        neg_freq = {key: 0 for key in top_terms}

    terms = []

    # Look up any user-assigned tags for the top terms.
    s_fields = {
        "term": "",
        "index": es_info['activeCrawlerIndex'],
        "doc_type": es_info['docType'],
    }
    results = []
    for term in top_terms:
        s_fields["term"] = term
        res = multifield_term_search(s_fields, self._capTerms, ['tag', 'term'], self._termsIndex, 'terms', self._es)
        results.extend(res)

    tags = {result['term'][0]: result['tag'][0] for result in results}

    for term in top_terms:
        entry = [term, pos_freq[term], neg_freq[term], []]
        if tags and tags.get(term) is not None:
            entry[3] = tags[term].split(';')
        terms.append(entry)

    for term in top_bigrams:
        terms.append([term, 0, 0, []])

    for term in top_trigrams:
        terms.append([term, 0, 0, []])

    return terms
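# A minimal numpy sketch of the frequency computation in getTermsSummarySeedCrawler:
# given a documents-by-terms count matrix and its corpus (the column vocabulary),
# compute each top term's share of the total term count, defaulting to 0 for terms
# missing from the corpus, mirroring how pos_freq and neg_freq are filled above.
# The function and variable names here are illustrative, not part of the class.
import numpy as np

def term_share_sketch(tf_matrix, corpus, top_terms):
    totals = np.sum(tf_matrix, axis=0)    # total count of each term across all documents
    grand_total = np.sum(totals)
    freq = {}
    for term in top_terms:
        try:
            freq[term] = float(totals[corpus.index(term)]) / grand_total
        except ValueError:                 # term does not occur in the corpus
            freq[term] = 0
    return freq

# Example: term_share_sketch(np.array([[2, 0, 1], [1, 3, 0]]),
#                            ["ebola", "outbreak", "virus"], ["ebola", "vaccine"])
# returns {"ebola": 3/7.0, "vaccine": 0}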
def createModel(self, session):
    # Build an ACHE page-classifier model from the pages tagged relevant and
    # irrelevant, then package the classifier, the training pages, and the seeds
    # file into a zip served from vis/html/models/.
    es_info = self.esInfo(session['domainId'])

    data_dir = environ["DDT_HOME"] + "/data/"
    data_crawler = data_dir + es_info['activeCrawlerIndex']
    data_training = data_crawler + "/training_data/"
    data_negative = data_crawler + "/training_data/negative/"
    data_positive = data_crawler + "/training_data/positive/"

    if not isdir(data_positive):
        makedirs(data_positive)
    if not isdir(data_negative):
        makedirs(data_negative)

    pos_urls = [field['url'][0] for field in term_search(es_info['mapping']['tag'], ['relevant'], ['url'],
                                                         es_info['activeCrawlerIndex'], 'page', self._es)]
    neg_urls = [field['url'][0] for field in term_search(es_info['mapping']['tag'], ['irrelevant'], ['url'],
                                                         es_info['activeCrawlerIndex'], 'page', self._es)]

    pos_html = get_documents(pos_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'])
    neg_html = get_documents(neg_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'])

    # Write the positive pages to disk and record their urls as crawler seeds.
    seeds_file = data_crawler + "/seeds.txt"
    print "Seeds path ", seeds_file
    with open(seeds_file, 'w') as s:
        for url in pos_html:
            try:
                file_positive = data_positive + self.encode(url.encode('utf8'))
                print file_positive
                s.write(url.encode('utf8') + '\n')
                with open(file_positive, 'w') as f:
                    f.write(pos_html[url][es_info['mapping']['html']][0])
            except IOError:
                _, exc_obj, tb = exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                filename = f.f_code.co_filename
                linecache.checkcache(filename)
                line = linecache.getline(filename, lineno, f.f_globals)
                print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

    # Write the negative pages to disk.
    for url in neg_html:
        try:
            file_negative = data_negative + self.encode(url.encode('utf8'))
            with open(file_negative, 'w') as f:
                f.write(neg_html[url]['html'][0])
        except IOError:
            _, exc_obj, tb = exc_info()
            f = tb.tb_frame
            lineno = tb.tb_lineno
            filename = f.f_code.co_filename
            linecache.checkcache(filename)
            line = linecache.getline(filename, lineno, f.f_globals)
            print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

    models_dir = environ["DDT_HOME"] + "/vis/html/models/"
    crawlermodel_dir = models_dir + es_info['activeCrawlerIndex']

    if not isdir(models_dir):
        makedirs(models_dir)
    if not isdir(crawlermodel_dir):
        makedirs(crawlermodel_dir)

    # Train the page classifier with ACHE's buildModel command.
    ache_home = environ['ACHE_HOME']
    comm = ache_home + "/bin/ache buildModel -t " + data_training + " -o " + crawlermodel_dir + " -c " + ache_home + "/config/stoplist.txt"
    p = Popen(comm, shell=True, stderr=PIPE)
    output, errors = p.communicate()
    print output
    print errors

    # Package the classifier, training pages, and seeds into a downloadable zip.
    zip_filename = models_dir + es_info['activeCrawlerIndex'] + "_model.zip"
    with ZipFile(zip_filename, "w") as modelzip:
        if isfile(crawlermodel_dir + "/pageclassifier.features"):
            print "zipping file: " + crawlermodel_dir + "/pageclassifier.features"
            modelzip.write(crawlermodel_dir + "/pageclassifier.features", "pageclassifier.features")

        if isfile(crawlermodel_dir + "/pageclassifier.model"):
            print "zipping file: " + crawlermodel_dir + "/pageclassifier.model"
            modelzip.write(crawlermodel_dir + "/pageclassifier.model", "pageclassifier.model")

        if exists(data_crawler + "/training_data/positive"):
            print "zipping file: " + data_crawler + "/training_data/positive"
            for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/positive"):
                for html_file in filenames:
                    modelzip.write(dirpath + "/" + html_file, "training_data/positive/" + html_file)

        if exists(data_crawler + "/training_data/negative"):
            print "zipping file: " + data_crawler + "/training_data/negative"
            for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/negative"):
                for html_file in filenames:
                    modelzip.write(dirpath + "/" + html_file, "training_data/negative/" + html_file)

        if isfile(data_crawler + "/seeds.txt"):
            print "zipping file: " + data_crawler + "/seeds.txt"
            modelzip.write(data_crawler + "/seeds.txt", es_info['activeCrawlerIndex'] + "_seeds.txt")

    chmod(zip_filename, 0o777)

    return "models/" + es_info['activeCrawlerIndex'] + "_model.zip"