def __init__(self):
    self.parameter = list()
    self.total = list()
    self.paramCount = 0
    self.type = 'string'
    self.actionIndexer = Indexer(self.type)
    self.featureIndexer = Indexer(self.type)
def processFile(self):
    interpreter = Interpreter()
    # return all the documents present in the file
    output = self.path + '.bin'
    if isfile(output):
        print('loading tokens')
        self.index = pickle.load(open(output, 'rb'))
        self.indexer = Indexer(self.tokenizer, index=self.index)
    else:
        self.indexer = Indexer(self.tokenizer)
        file = open(self.path, 'r', encoding='utf-8', errors='ignore')
        maximum = os.stat(self.path).st_size
        # initialize the variables
        i = 0
        progress = 0
        document = []
        for line in file:
            progress += len(line)
            if line == '\n':
                interpreter.process(self.indexer, document)
                document = []
            else:
                document += [line]
            i += 1
            if i >= 5000:
                i = 0
                log(progress, maximum)
        file.close()
        self.index = self.indexer.index
        print('\nsaving tokens')
        pickle.dump(self.index, open(output, 'wb'))
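# A minimal sketch of the log() progress helper assumed by processFile() above;
# the original implementation is not shown here, so this version is hypothetical.
import sys

def log(progress, maximum):
    # Render an in-place percentage indicator on stderr.
    percent = 100.0 * progress / maximum if maximum else 100.0
    sys.stderr.write('\rprocessed: %.1f%%' % percent)
    sys.stderr.flush()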
def start_indexing(dirs_list, dirs_dicts, main_path, posting_path, to_stem,
                   start_index, end_index, directory):
    dirs_dicts[directory] = None
    reader = ReadFile()
    parser = Parse(main_path)
    indexer = Indexer(posting_path + directory)
    if to_stem:
        parser.to_stem = True
        indexer.to_stem = True
    if not os.path.exists(posting_path + directory):
        os.makedirs(posting_path + directory)
    documents = {}
    i = start_index
    while i < end_index:
        docs = reader.separate_docs_in_file(main_path + '\\corpus', dirs_list[i])
        j = 0
        for doc_id in docs:
            doc_dict = parser.main_parser(docs[doc_id].text, docs[doc_id])
            docs[doc_id].text = None
            # Signal the indexer once the last document of the last file is reached.
            if i == end_index - 1 and j == len(docs) - 1:
                indexer.finished_parse = True
            indexer.index_terms(doc_dict, doc_id)
            documents[doc_id] = docs[doc_id]
            j += 1
        i += 1
    dirs_dicts[directory] = [
        indexer.post_files_lines, indexer.terms_dict, documents, reader.languages
    ]
def __init__(self, device):
    super(PostagEmbedding, self).__init__(device=device)
    self.indexer = Indexer(
        special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '</s>': 3, '<mask>': 4},
        with_del_stopwords=False)  # must always be False for postag embeddings
    datasets = Dataset().get_instance()
    # POS-tag every training sentence, then keep only the tag sequences.
    sentences = [nltk.pos_tag(self.indexer.tokenize(pairs[0]))
                 for pairs in datasets['train']]
    sentences = [[pairs[1] for pairs in sentence] for sentence in sentences]
    for sentence in sentences:
        self.indexer.add_sentence(sentence, with_raw=True)
    self.embedding_dim = 10
    self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                  embedding_dim=self.embedding_dim,
                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
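# Illustration of the nltk.pos_tag step above on toy input; exact tags depend on
# the tagger model (requires nltk.download('averaged_perceptron_tagger')).
import nltk

tokens = ['the', 'cat', 'sat']
tagged = nltk.pos_tag(tokens)             # e.g. [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]
tags_only = [pair[1] for pair in tagged]  # e.g. ['DT', 'NN', 'VBD']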
def indexDirs():
    click.echo("Indexing Files....")
    libraries = jhandler.getLibs()
    indexer = Indexer(libraries)
    nols = indexer.index()
    click.echo("{} library items detected!".format(nols))
    click.echo("Done Indexing Files....")
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)
    #prof1, nouns1 = get_profiles(ind, args.start)
    #prof2, nouns2 = get_profiles(ind, args.end)
    cur = ind.get_db_for_date(args.start)
    prof, nouns = get_profiles(ind, args.start)

    # Collect the union of all reply ids, then build the profile/reply matrix.
    replys_v = set()
    for p in prof:
        replys_v |= set(prof[p].replys.keys())
    m = []
    for p in prof:
        m_i = []
        for r in replys_v:
            if r in prof[p].replys:
                m_i.append(prof[p].replys[r])
            else:
                m_i.append(0)
        m.append(m_i)

    logging.info("%s x %s" % (len(m), len(m[0])))

    # Rank-k truncation of the SVD.
    u, s, v = numpy.linalg.svd(m, full_matrices=False)
    k = 50
    uk = numpy.transpose(numpy.transpose(u)[:k])
    sk = s[:k]

    stats.create_given_tables(cur, ["noun_similarity"])
    cur.execute("create table if not exists noun_sim_svd as select * from noun_similarity limit 0")

    # Pairwise cosine similarity in the reduced space, saved in batches.
    p_keys = prof.keys()
    sims = []
    for i in range(0, len(p_keys)):
        for j in range(i + 1, len(p_keys)):
            p1_ = map(lambda x: u[i][x] * sk[x], range(0, k))
            p2_ = map(lambda x: u[j][x] * sk[x], range(0, k))
            sim = numpy.dot(p1_, p2_) / (numpy.linalg.norm(p1_) * numpy.linalg.norm(p2_))
            sims.append((p_keys[i], p_keys[j], sim))
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []
                logging.info("Another 20k seen")

    save_sims(cur, sims)
    logging.info("done")
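# Toy illustration (hypothetical data) of the truncated-SVD cosine step above:
# project two rows into the rank-k space and compare them.
import numpy

m = numpy.array([[1.0, 0.0, 2.0],
                 [0.5, 1.0, 0.0],
                 [1.0, 0.1, 1.9]])
u, s, v = numpy.linalg.svd(m, full_matrices=False)
k = 2
p1 = u[0][:k] * s[:k]  # row 0 in the reduced space
p2 = u[2][:k] * s[:k]  # row 2 in the reduced space
sim = numpy.dot(p1, p2) / (numpy.linalg.norm(p1) * numpy.linalg.norm(p2))
print(sim)  # close to 1.0, since rows 0 and 2 are nearly parallel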
def interact(c):
    # Create an indexer object
    indexer = Indexer(root)
    while True:
        # Get directory details
        curr_directory, folders, files = indexer.get_dir_details()
        print "Current Directory :", curr_directory

        # Send the details to the client
        c.send(curr_directory + "\n")
        for item in folders:
            c.send(item + '\n')
        c.send("\n")
        for item in files:
            c.send(item[0] + '\n' + str(item[1]) + "\n")
        c.send("/")

        # Receive response from client; -1 means disconnect
        choice = c.recv(1024)
        if choice == "-1":
            print "Disconnecting from Client"
            return
        elif int(choice) <= len(folders):
            # Change directory if a folder was selected
            indexer.make_choice(int(choice))
        else:
            # Send file if a file was selected
            send_file(c, indexer.get_file_path(int(choice)))
            return
def main():
    # Indexer Initialization
    indexer = Indexer()
    indexer.build_dictionary()
    # indexer.write_dict_to_file()

    # classify files
    run_classifier(indexer)
def __init__(self):
    self._max_url_length = 100
    self._url_list = []
    self._title_list = []
    self._max_stay_on_site = 100
    self._current_on_site = 0
    self._previous_domain = None
    self._max_urls_in_list = 500
    self._max_new_urls_per_page = 100
    self._aggressive_pruning = True
    self._indexer = Indexer("localhost", 9200)
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    logging.info("Start")
    ind = Indexer(DB_DIR)

    grades = (1, 10, 100, 1000)
    data = [["date", "nouns", "tweets", "tweet_chains"] + map(lambda x: "cnt > %s" % x, grades)]
    print data
    dates = []
    for date in sorted(ind.dates_dbs.keys()):
        if args.start is not None and date < args.start:
            continue
        if args.end is not None and date > args.end:
            continue
        cur = ind.get_db_for_date(date)
        tables = cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table' and name = 'tweets_nouns'"
        ).fetchall()
        if len(tables) == 0:
            logging.error("No tweets_nouns for date %s" % date)
            continue
        stats.create_given_tables(cur, ["post_cnt"])
        post_cnt = cur.execute("select count(*) from post_cnt").fetchone()[0]
        if post_cnt == 0:
            cur.execute(
                "insert or ignore into post_cnt select noun_md5, count(*) from tweets_nouns group by noun_md5"
            )
        cnt = [date]
        nouns_cnt = cur.execute("select count(*) from nouns").fetchone()[0]
        cnt.append(nouns_cnt)
        tweets = cur.execute("select count(*) from tweets").fetchone()[0]
        cnt.append(tweets if tweets is not None else "~")
        tweet_chains = cur.execute("select count(*) from tweet_chains").fetchone()[0]
        cnt.append(tweet_chains if tweet_chains is not None else "~")
        # Fraction of nouns whose post count exceeds each grade threshold.
        for i in grades:
            cnti = cur.execute(
                "select count(*) from (select 1 from post_cnt where post_cnt > %s group by post_md5)"
                % i).fetchone()[0]
            cnt.append("%.2f" % ((cnti + 0.0) / nouns_cnt))
        data.append(cnt)

    for row in data:
        print_cols(row)
def load(self):
    self.indexer = Indexer(self.posting_path)
    if self.to_stem:
        self.indexer.to_stem = True
    self.languages = self.indexer.load()
    self.avg_doc_length = self.indexer.docs_avg_length
    self.searcher = Searcher(self.main_path, self.posting_path,
                             self.indexer.terms_dict, self.indexer.cities_dict,
                             self.indexer.docs_dict, self.avg_doc_length,
                             self.to_stem, self.with_semantics)
    self.searcher.model = Word2Vec.load(self.posting_path + '//model.bin')
def __init__(self, numOfLayer):
    self.num = numOfLayer
    self.parent = []
    self.children = []
    self.handled = []
    self.Indexer = Indexer()
    self.Processor = Processor()
    self.Porter = PorterStemmer()
    self.db = []
    link = "http://www.cse.ust.hk/"
    self.parent.append(link)
def start(self):
    self.indexer = Indexer(self.posting_path)
    if self.to_stem:
        self.indexer.to_stem = True
    dirs_list = os.listdir(self.main_path + '\\corpus')

    # Create temp postings via multiprocessing
    dirs_dict = ParallelMain.start(self.main_path, self.posting_path,
                                   self.to_stem, dirs_list)

    # Merge the dictionaries that were created by the processes
    docs = {}
    files_names = []
    post_files_lines = []
    total_length = 0
    for dir in dirs_dict.keys():
        tmp_docs_dict = dirs_dict[dir][2]
        for doc_id in tmp_docs_dict:
            docs[doc_id] = tmp_docs_dict[doc_id]
            total_length += docs[doc_id].length
        for lang in dirs_dict[dir][3]:
            self.languages.add(lang)
        old_post_files_lines = dirs_dict[dir][0]
        for i in range(0, len(old_post_files_lines)):
            files_names.append(dir + "\\Posting" + str(i) if not self.to_stem
                               else dir + "\\sPosting" + str(i))
            post_files_lines.append(old_post_files_lines[i])
    self.avg_doc_length = total_length / len(docs)

    # Get cities that appear in the corpus
    i = 0
    while i < len(dirs_list):
        self.reader.read_cities(self.main_path + '\\corpus', dirs_list[i])
        i += 1

    terms_dicts = [
        dirs_dict["\\Postings1"][1], dirs_dict["\\Postings2"][1],
        dirs_dict["\\Postings3"][1], dirs_dict["\\Postings4"][1]
    ]
    terms_dict = Merge.start_merge(files_names, post_files_lines, terms_dicts,
                                   self.posting_path, self.to_stem)
    self.indexer.docs_avg_length = self.avg_doc_length
    self.indexer.terms_dict = terms_dict
    self.indexer.docs_dict = docs
    self.indexer.index_cities(self.reader.cities)
    self.indexer.post_pointers(self.languages)
def __init__(self):
    DBCrawl.connect()
    DBUnCrawl.connect()
    DBRobot.connect()
    DBWebPage.connect()
    DBPageRank.connect()
    DBIndexer.connect()
    indexedCount.connect()
    #DBQuery.connect()
    self._getDBTables()
    self.indexer = Indexer()
    self.numberOfThreads = 1
    self._setNumOfThreads()
    self.crawlerObjs = []
    self._createCrawlerObjects()
def __init__(self, device):
    super(AbsolutePositionalEmbedding, self).__init__(device=device)
    self.max_length = 150
    self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2,
                                           '</s>': 3, '<mask>': 4},
                           with_del_stopwords=self.with_del_stopwords)
    # Index the position labels '0' .. '149' as vocabulary items.
    self.indexer.add_sentence(list(map(str, range(self.max_length))), with_raw=True)
    self.embedding_dim = 20
    self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                  embedding_dim=self.embedding_dim,
                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
def __init__(self, device):
    super(NtuaTwitterEmbedding, self).__init__(device=device)
    self.path = Path('../data/models/ntua-slp-semeval2018/ntua_twitter_300.txt')
    with self.path.open('r', encoding='utf-8-sig') as f:
        texts = f.readlines()
    # First line is the word2vec-style header: "<vocab size> <dimension>".
    headers = texts[0].strip().split(' ')
    contents = [text.strip().split(' ') for text in texts[1:]]
    vocab = [content[0] for content in contents]
    weights = [list(map(float, content[1:])) for content in contents]
    self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2,
                                           '</s>': 3, '<mask>': 4},
                           with_del_stopwords=self.with_del_stopwords)
    for word in vocab:
        self.indexer.count_word(word)
        self.indexer.add_word(word)
    self.embedding_dim = int(headers[1])
    # Prepend zero vectors for the five special tokens.
    special_weights = [[0.0] * self.embedding_dim] * 5
    weights = torch.FloatTensor(special_weights + weights)
    self.embedding = nn.Embedding.from_pretrained(embeddings=weights,
                                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
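# Minimal usage sketch of a pretrained layer built as above (toy weights;
# index 2 plays the role of '<pad>' as in the special-token map).
import torch
import torch.nn as nn

weights = torch.randn(10, 300)
weights[2] = 0.0  # zero vector for the padding row
emb = nn.Embedding.from_pretrained(embeddings=weights, padding_idx=2)
ids = torch.tensor([[5, 6, 2, 2]])  # a batch of token ids, right-padded
out = emb(ids)                      # shape: (1, 4, 300); rows for id 2 are zero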
def main(argv):
    collectionFile = ''
    tokenizerType = ''
    usage = 'main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>'
    try:
        opts, args = getopt.getopt(argv, "hf:t:", ["collectionFile=", "tokenizerType="])
    except getopt.GetoptError:
        print(usage)
        sys.exit()
    if len(opts) != 2:
        print(usage)
        sys.exit()
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
    indexer = Indexer(collectionFile, tokenizerType)
    indexer.listTermsInOneDoc()
    indexer.listHighestDocFreqTerms()
def __init__(self, device):
    super(RawEmbedding, self).__init__(device=device)
    self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2,
                                           '</s>': 3, '<mask>': 4},
                           with_del_stopwords=self.with_del_stopwords)
    datasets = Dataset().get_instance()
    sentences = [pairs[0] for pairs in datasets['train']]
    self.indexer.count_word_in_text(sentences)
    self.indexer.add_sentences(sentences)
    self.embedding_dim = 100
    self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                  embedding_dim=self.embedding_dim,
                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
def probe_vocabs():
    datasets, tags = get_datasets()
    indexer = Indexer(with_preprocess=False)
    n_grams = [1, 2, 3]
    raw_texts = datasets
    multi_stats = {i: {
        'vocabs': {tag: set() for tag in tags},
        'counts': {tag: {} for tag in tags},
        'vocabs_by_labels': {tag: {'INFORMATIVE': set(), 'UNINFORMATIVE': set()} for tag in tags},
        'counts_by_labels': {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags},
        'ann_texts': {tag: [] for tag in tags},
        'del_texts': {tag: [] for tag in tags}
    } for i in n_grams}

    # Tokens to drop: markup tags, corpus-specific words, punctuation, stopwords.
    del_items = set(['<hashtag>', '</hashtag>', '<allcaps>', '</allcaps>', '<user>',
                     'covid19', 'coronavirus', 'covid', '<number>', 'httpurl', 19, '19'])
    del_items |= set(["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", "<", ">", "(", ")", "/"])
    del_items |= set(nltk_stopwords.words('english'))

    for n_gram in n_grams:
        for tag in tags:
            for text, label in datasets[tag]:
                words = indexer.text_processor.pre_process_doc(text)
                label = get_label_text(label)
                multi_stats[n_gram]['ann_texts'][tag].extend(
                    [['_'.join(words[i: i + n_gram]) for i in range(0, len(words) - n_gram + 1)]])
                del_words = [word for word in words if word not in del_items]
                multi_stats[n_gram]['del_texts'][tag].extend(
                    [['_'.join(del_words[i: i + n_gram]) for i in range(0, len(del_words) - n_gram + 1)]])
                if n_gram != 0:
                    words = del_words
                for word in ['_'.join(words[i: i + n_gram]) for i in range(0, len(words) - n_gram + 1)]:
                    multi_stats[n_gram]['vocabs'][tag].add(word)
                    multi_stats[n_gram]['vocabs_by_labels'][tag][label].add(word)
                    if word in multi_stats[n_gram]['counts'][tag].keys():
                        multi_stats[n_gram]['counts'][tag][word] += 1
                    else:
                        multi_stats[n_gram]['counts'][tag][word] = 1
                    if word in multi_stats[n_gram]['counts_by_labels'][tag][label].keys():
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] += 1
                    else:
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] = 1
    return {'multi_stats': multi_stats, 'raw_texts': raw_texts}
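# Worked illustration of the '_'.join n-gram construction used above (toy input).
words = ['stay', 'home', 'save', 'lives']
n_gram = 2
bigrams = ['_'.join(words[i: i + n_gram]) for i in range(0, len(words) - n_gram + 1)]
# -> ['stay_home', 'home_save', 'save_lives']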
def probe_sentence_length():
    datasets, tags = get_datasets()
    counts = {tag: {} for tag in tags}
    counts_by_labels = {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags}
    indexer = Indexer()
    for tag in tags:
        for text, label in datasets[tag]:
            words = indexer.text_processor.pre_process_doc(text)
            label = get_label_text(label)
            if len(words) in counts[tag].keys():
                counts[tag][len(words)].append(words)
            else:
                counts[tag][len(words)] = [words]
            if len(words) in counts_by_labels[tag][label].keys():
                counts_by_labels[tag][label][len(words)] += 1
            else:
                counts_by_labels[tag][label][len(words)] = 1
    return {'counts': counts, 'counts_by_labels': counts_by_labels}
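# Example of consuming the structure returned above; the 'train' tag is a
# hypothetical split name. counts[tag][length] holds the word lists of that length.
stats = probe_sentence_length()
lengths = stats['counts']['train']
total = sum(length * len(sents) for length, sents in lengths.items())
n_sents = sum(len(sents) for sents in lengths.values())
print('average sentence length: %.1f' % (total / n_sents))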
def __init__(self, device):
    super(StanfordTwitterEmbedding, self).__init__(device=device)
    self.path = Path('../data/models/glove.twitter.27B/glove.twitter.27B.200d.txt')
    with_raw_file = False
    if with_raw_file:
        # Parse the raw GloVe text file in parallel and cache the results.
        with self.path.open('r', encoding='utf-8-sig') as f:
            texts = f.readlines()
        headers = [len(texts), None]
        vocab, weights = map(list, zip(*Parallel(n_jobs=10)(
            [delayed(self.get_weights)(text) for text in texts])))
        with (self.path.parent / 'vocab.pkl').open('wb') as f:
            pickle.dump(vocab, f)
        with (self.path.parent / 'weights.pkl').open('wb') as f:
            pickle.dump(weights, f)
    else:
        # Load the cached vocabulary and weights.
        with (self.path.parent / 'vocab.pkl').open('rb') as f:
            vocab = pickle.load(f)
        with (self.path.parent / 'weights.pkl').open('rb') as f:
            weights = pickle.load(f)
    self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2,
                                           '</s>': 3, '<mask>': 4},
                           with_del_stopwords=self.with_del_stopwords)
    for word in vocab:
        self.indexer.count_word(word)
        self.indexer.add_word(word)
    self.embedding_dim = len(weights[0])
    # Prepend zero vectors for the five special tokens.
    special_weights = [[0.0] * self.embedding_dim] * 5
    weights = torch.FloatTensor(special_weights + weights)
    self.embedding = nn.Embedding.from_pretrained(embeddings=weights,
                                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
def main(inputDir, outputDir):
    m = Indexer()
    files = os.listdir(inputDir)

    # for measuring elapsed time
    elapsed_time = []
    start = time.time()

    # Loop over all files in the given directory
    for file in files:
        if file.endswith(".html"):
            inputFile = os.path.join(inputDir, file)
            m.parse(inputFile)  # the tokenization happens inside this function
            m.mappings.append((m.doc_num, inputFile))
    end = time.time()
    elapsed_time.append(end - start)

    m.writeFiles(outputDir, N=len(m.mappings))
    print("Ran in {} seconds.".format(elapsed_time[-1]))
def main():
    indexer = Indexer()
    numDocs = 0
    ## TODO: need to update to directory's DEV folder
    for subdir, dirs, files in os.walk(r'C:\Users\Kevin Huynh\Projects\cs121-a3\DEV'):
        for filename in files:
            filepath = subdir + os.sep + filename
            f = open(filepath)
            data = json.load(f)
            print(data["url"])
            indexer.parse(data['content'], data['url'])
            numDocs += 1
    indexer.compute_tdidf()
    sortedTokens = sorted(indexer.invertedIndex.items(),
                          key=lambda x: x[1]["total_frequency"], reverse=True)
    print("Number of Documents: {}".format(numDocs))
    print("Number of Unique Tokens: {}".format(len(indexer.invertedIndex.keys())))
    # file1 = open("index.txt", "a")
    # Partition the index into files keyed by each token's first two characters.
    for k, v in sortedTokens:
        postings = v["postings"]
        if len(k) < 2:
            filename = k + ".txt"
        else:
            filename = k[:2] + ".txt"
        file = open("indexes/" + filename, "a")
        sorted_postings = sorted(postings.items(), key=lambda x: x[1], reverse=True)
        file.write("{}:{}\n".format(k, sorted_postings))
        file.close()
def main():
    indexer = Indexer()
    numDocs = 0
    for subdir, dirs, files in os.walk(r'C:\Users\Justin Ho\Documents\CS 121\developer\DEV'):
        for filename in files:
            filepath = subdir + os.sep + filename
            f = open(filepath)
            data = json.load(f)
            print(data["url"])
            indexer.parse(data['content'], data['url'])
            numDocs += 1
    sortedTokens = sorted(indexer.invertedIndex.items(),
                          key=lambda x: x[1]["tf-idf"], reverse=True)
    print("Number of Documents: {}".format(numDocs))
    print("Number of Unique Tokens: {}".format(len(indexer.invertedIndex.keys())))
    file1 = open("index.txt", "a")
    for k, v in sortedTokens:
        file1.write("{}:{}\n".format(k, v))
    file1.close()
from Model import *
from Indexer import Indexer

DB.connect()

"""RUN this only once when creating the table!"""
#DB.create_tables([IndexerTable])

test = Indexer()

zidan = test.addToIndex("zidan", "http://www.ZidanMusk.com")
osmium = test.addToIndex("osmium", "http://www.Osmium.com")
abdo = test.addToIndex("abdo", "http://www.Abdo.com")

words = ["zidan", "osmium", "abdo", "musk"]
searchW = test.lookupWithWords(words)

pages = ["http://www.ZidanMusk.com", "http://www.Osmium.com", "http://www.Abdo.comNOOO"]
searchP = test.lookupWithPages(pages)

print(searchW)
print(searchP)

DB.close()
# Add a required original file name.
parser.add_argument('original', help='The original text file name.')

# Add a required preprocessed file name.
parser.add_argument('preprocessed', help='The preprocessed file for building the index.')

# Add an optional map argument selecting the multimap data structure.
parser.add_argument('--map', help='The requested multimap data structure.', default='avl')

# Add an optional index file name. The 'dest' arg is how
# it will be referred to inside the parser.
parser.add_argument('--index', dest='index', help='The optional index file name.')

# If no input was supplied, that is handled by argparse,
# since the positional arguments are required.

# Get the args for use.
args = parser.parse_args()

if args.index:
    myIndexer = Indexer(args.original, args.preprocessed, args.map, args.index)
else:
    myIndexer = Indexer(args.original, args.preprocessed, args.map)

myIndexer.index()
myIndexer.UserInterface()
from Frontier import Frontier
from PageRanker import PageRanker
from Indexer import Indexer
from Searcher import Searcher
import re

frontier = Frontier()
pageRanker = PageRanker()
indexer = Indexer()

seedDocuments = [
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
]


def printWebGraph(webGraph):
    print
    print '-*( Web Graph )*-'
    print
    for entry in sorted(webGraph.keys()):
        print entry + ' -> ' + ', '.join(webGraph[entry])


def printIndex(index):
    print
    print '-*( Indices )*-'
    print
    for term, occurrences in sorted(index.iteritems()):
        print '(' + term[0] + ', df:' + str(term[1]) + ') ->',
from Indexer import Indexer
import argparse
import sys

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Index words in a preprocessed '
                                     'document and search for requested keywords.')
    parser.add_argument('original', help='Original text document')
    parser.add_argument('preprocessed', help='Preprocessed text document')
    parser.add_argument('--index', dest='indexed', help='Writes list of '
                        'index words into the given file on quit.')
    parser.add_argument('--map', dest='map_type', help='Uses given '
                        'data structures to index words. Available options are '
                        'avl, unsorted, sorted, chain, probe, splay, rb, dict, '
                        'and od.')
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    if args:
        indexer = Indexer(args.original, args.preprocessed, args.indexed, args.map_type)
        indexer.index()
        print(indexer)
        indexer.startUI()
        indexer.dump()
    else:
        parser.print_help()
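# Example invocation of the script above (hypothetical file names):
#   python main.py mobydick.txt mobydick_preprocessed.txt --map avl --index words.idx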
def run_batch(self, docs, batchNo, mapper):
    print "Running Batch %s" % batchNo
    indexer = Indexer(docs, mapper, self.catalogs, self.docLengths)
    indexer.index(keep_stopwords=True, stem=False)
import Queue
reportQueue = Queue.Queue(200)

from HttpServer import HttpServerThread
import time
from Logger import *

if __name__ == "__main__":
    log_info("start indexer ...")
    cfgFile = "../conf/conf.yml"
    conf = RygConf.load(cfgFile)

    httpd = HttpServerThread(conf.center_host, conf.center_port, conf.mon_hosts, reportQueue)
    httpd.start()
    log_info("httpd thread started")

    processors = list()
    for i in xrange(0, conf.threads):
        p = Indexer(reportQueue, conf)
        p.start()
        processors.append(p)
    log_info("processor threads started")

    # Endless loop until terminated; do a periodic health check.
    while True:
        if not httpd.isAlive():
            log_critical("HTTPD thread exited, terminate for restart")
            break
        errorFlag = False
        for p in processors:
            if not p.isAlive():
                errorFlag = True
                break
        if errorFlag:
            log_critical("ReportProcessor thread exited, terminate for restart")