def populateDatabase(elems, language='EN', dbname='TwitterDB', host='localhost',
                     port=27017, mode=0, serialized=True):
    if elems:
        documents = []
        if serialized:
            # single thread
            for elem in elems:
                if len(elem) >= 7:
                    document = processElement_serial(elem, language, mode)
                    if document:
                        documents.append(document)
                else:
                    try:
                        print "tweet with problems: ", elem[0]
                    except Exception, e:
                        print e
        else:
            # parallelized
            global language_global
            language_global = language
            global mode_global
            mode_global = mode
            no_threads = cpu_count()
            with ProcessPoolExecutor(max_workers=no_threads) as worker:
                for result in worker.map(processElement_parallel, elems):
                    if result:
                        documents.append(result)
        if documents:
            queries = Queries(dbname=dbname, host=host, port=port)
            queries.bulkInsert(documents=documents)
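# The parallel branch passes its per-run settings through module-level
# globals because ProcessPoolExecutor.map() hands each worker a single
# tweet and nothing else. A minimal sketch of the worker side of that
# pattern, assuming processElement_parallel simply defers to
# processElement_serial (the project's real implementation may differ):
def processElement_parallel(elem):
    # language_global and mode_global were set by populateDatabase before
    # the pool was created, so the forked worker processes inherit them
    if len(elem) >= 7:
        return processElement_serial(elem, language_global, mode_global)
    return None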
def main(filename, csv_delimiter='\t', header=True, dbname='TwitterDB',
         host='localhost', port=27017, language='EN', initialize=0, mode=0,
         serialized=False):
    # print mode, serialized, header
    # initialize everything from the start
    if initialize == 0:
        queries = Queries(dbname=dbname, host=host, port=port)
        queries.dropDocuments()
    populateDB(filename, csv_delimiter, header, language, dbname=dbname,
               host=host, port=port, mode=mode, serialized=serialized)
    constructIndexes(dbname, host, port)
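# A minimal invocation sketch; 'tweets.csv' is a hypothetical tab-separated
# dump of tweets, and initialize=0 drops the existing collection before
# repopulating the database and rebuilding both indexes:
if __name__ == '__main__':
    main('tweets.csv', csv_delimiter='\t', header=True, dbname='TwitterDB',
         host='localhost', port=27017, language='EN', initialize=0, mode=0,
         serialized=False)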
def constructIndexes(dbname, host, port):
    # build the vocabulary index
    queries = Queries(dbname=dbname, host=host, port=port)
    start = time.time()
    queries.constructVocabulary()
    end = time.time()
    print "vocabulary_build.append(", (end - start), ")"
    # build the named-entity (NE) index
    start = time.time()
    queries.constructNamedEntities()
    end = time.time()
    print "ne_build.append(", (end - start), ")"
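# The timings above are printed as ready-to-paste Python statements (e.g.
# "vocabulary_build.append( 12.3 )") so they can be collected into
# benchmark lists by hand. A hypothetical standalone run:
# constructIndexes('TwitterDB', 'localhost', 27017)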
def populateDatabase(elems, dbname='OLAPDB', mode=0):
    global global_mode
    global_mode = mode
    if elems:
        documents = []
        no_threads = cpu_count()
        with ProcessPoolExecutor(max_workers=no_threads) as worker:
            for result in worker.map(processElement, elems):
                if result:
                    documents.append(result)
        if documents:
            queries = Queries(dbname=dbname)
            queries.bulkInsert(documents=documents)
class Search:
    def __init__(self, searchPhrase, dbname='TwitterDB', query=None, k=0):
        self.queries = Queries(dbname)
        self.words = [word.split('/')[0]
                      for word in lemmatize(cleanText.removeStopWords(
                          cleanText.cleanText(searchPhrase)[0]))]
        self.idfs = dict()
        and_list = []
        for word in self.words:
            and_list.append({'words.word': word})
        self.query_search = {"$and": and_list}
        if query:
            self.existing = True
            self.query_search.update(query)
        else:
            self.existing = False
        self.k = k

    def results(self):
        # get the IDF of each search word
        idf = self.queries.getWords(query={'word': {"$in": self.words}},
                                    fields={'word': 1, 'IDF': 1},
                                    existing=self.existing)
        for word in idf:
            self.idfs[word['word']] = word['IDF']
        # get the matching documents
        fields = {'_id': 1, 'author': 1, 'date': 1, 'rawText': 1,
                  'words.word': 1, 'words.tf': 1}
        documents = self.queries.getDocuments(query=self.query_search,
                                              fields=fields)
        list_documents = []
        # parallelized version - doesn't work yet
        # no_threads = cpu_count()
        # with ProcessPoolExecutor(max_workers=no_threads) as worker:
        #     for result in worker.map(self.process, documents):
        #         if result:
        #             list_documents.append(result)
        # serial version
        for doc in documents:
            list_documents.append(self.process(doc))
        return list_documents

    def process(self, elem):
        document = dict()
        document['id'] = elem['_id']
        document['rawText'] = elem['rawText']
        document['author'] = elem['author']
        document['date'] = elem['date']
        score = 0
        for word in self.idfs:
            tf = 1
            for tfs in elem['words']:
                # each entry is {'word': ..., 'tf': ...}; keep the tf of the
                # entry matching the current search word
                if tfs.get('word') == word:
                    tf = tfs['tf']
            score += tf * self.idfs[word]
        document['score'] = math.log(1 + score, 2)
        return document
from flask import Flask
from mabed.mabed_files import MabedFiles
import subprocess
import os, shutil
from functools import wraps
import threading
import pickle
from streaming.stream import Streaming
import datetime
from indexing.queries import Queries

# Connect to the database
dbname = 'TwitterDB_demo'
host = 'localhost'
port = 27017
queries = Queries(dbname=dbname, host=host, port=port)

can_collect_tweets = False
lda_running = False
mabed_running = False

app = Flask(__name__)
query = {}
query_pretty = ""


def check_auth(username, password):
    return username == 'demo' and password == 'ilikecats'
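# check_auth() together with the functools.wraps import suggests the
# standard Flask HTTP Basic Auth recipe; a minimal sketch of the decorator
# that would guard protected routes (the route below is hypothetical):
from flask import request, Response

def requires_auth(f):
    @wraps(f)
    def decorated(*args, **kwargs):
        auth = request.authorization
        if not auth or not check_auth(auth.username, auth.password):
            # ask the client for Basic Auth credentials
            return Response('Login required', 401,
                            {'WWW-Authenticate': 'Basic realm="Login Required"'})
        return f(*args, **kwargs)
    return decorated

# @app.route('/dashboard')
# @requires_auth
# def dashboard():
#     ...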
class Search:
    def __init__(self, searchPhrase, dbname='TwitterDB', host='localhost',
                 port=27017, query=None, k=0):
        self.queries = Queries(dbname=dbname, host=host, port=port)
        # clean and lemmatize the search phrase; lemmatize() yields
        # 'token/POS' strings, so keep only the token part
        self.words = [word.split('/')[0]
                      for word in lemmatize(cleanText.removeStopWords(
                          cleanText.cleanText(searchPhrase)[0]))]
        self.idfs = dict()
        and_list = []
        if self.words:
            for word in self.words:
                and_list.append({'words.word': word})
            self.query_search = {"$and": and_list}
            if query:
                self.existing = True
                self.query_search.update(query)
            else:
                self.existing = False
        self.k = k

    def results(self):
        list_documents = []
        if self.words:
            # get the IDF of each search word
            idf = self.queries.getWords(query={'word': {"$in": self.words}},
                                        fields={'word': 1, 'IDF': 1},
                                        existing=self.existing)
            for word in idf:
                self.idfs[word['word']] = word['IDF']
            # get the matching documents
            fields = {'_id': 1, 'author': 1, 'date': 1, 'rawText': 1,
                      'words.word': 1, 'words.tf': 1}
            documents = self.queries.getDocuments(query=self.query_search,
                                                  fields=fields)
            # serial version
            for doc in documents:
                list_documents.append(self.process(doc))
        return list_documents

    def process(self, elem):
        document = dict()
        document['id'] = elem['_id']
        document['rawText'] = elem['rawText']
        document['author'] = elem['author']
        document['date'] = elem['date']
        score = 0
        for word in self.idfs:
            # elem['words'] holds {'word': ..., 'tf': ...} entries; take the
            # tf of the entry matching the current search word (0 if absent)
            tf = 0
            for tfs in elem['words']:
                if tfs.get('word') == word:
                    tf = tfs['tf']
            score += tf * self.idfs[word]
        document['score'] = math.log(1 + score, 2)
        return document
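# Usage sketch with a hypothetical search phrase. Each result carries
# score = log2(1 + sum(tf * idf)) over the query words, so sorting by
# 'score' descending yields a TF-IDF ranking:
if __name__ == '__main__':
    search = Search('great concert tonight', dbname='TwitterDB')
    ranked = sorted(search.results(), key=lambda d: d['score'], reverse=True)
    # k is stored but unused in results(); a plausible use is truncating
    # the ranking to the top k hits
    top_k = ranked[:search.k] if search.k else ranked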