Example #1
 def __init__(self,
              searchPhrase,
              dbname='TwitterDB',
              host='localhost',
              port=27017,
              query=None,
              k=0):
     self.queries = Queries(dbname=dbname, host=host, port=port)
     self.words = [
         word.split('/')[0] for word in lemmatize(
             cleanText.removeStopWords(
                 cleanText.cleanText(searchPhrase)[0]))
     ]
     self.idfs = dict()
     and_list = []
     if self.words:
         for word in self.words:
             and_list.append({'words.word': word})
         self.query_search = {"$and": and_list}
         if query:
             self.existing = True
             self.query_search.update(query)
         else:
             self.existing = False
         self.k = k
Example #2
def populateDatabase(elems, language='EN', dbname='TwitterDB', host='localhost', port=27017, mode=0, serialized=True):
    if elems:
        documents = []
        if serialized:
            # single thread
            for elem in elems:
                if len(elem) >= 7:
                    document = processElement_serial(elem, language, mode)
                    if document:
                        documents.append(document)
                else:
                    try:
                        print "tweet with problems: ", elem[0]
                    except Exception, e:
                        print e
        else:
            # parallelized
            global language_global
            language_global = language
            global mode_global
            mode_global = mode
            no_threads = cpu_count()
            with ProcessPoolExecutor(max_workers=no_threads) as worker:
                for result in worker.map(processElement_parallel, elems):
                    if result:
                        documents.append(result)
        if documents:
            queries = Queries(dbname=dbname, host=host, port=port)
            queries.bulkInsert(documents=documents)
Example #3
def populateDatabase(elems,
                     language='EN',
                     dbname='TwitterDB',
                     host='localhost',
                     port=27017,
                     mode=0,
                     serialized=True):
    if elems:
        documents = []
        if serialized:
            # single thread
            for elem in elems:
                if len(elem) >= 7:
                    document = processElement_serial(elem, language, mode)
                    if document:
                        documents.append(document)
                else:
                    try:
                        print "tweet with problems: ", elem[0]
                    except Exception, e:
                        print e
        else:
            # parallelized
            global language_global
            language_global = language
            global mode_global
            mode_global = mode
            no_threads = cpu_count()
            with ProcessPoolExecutor(max_workers=no_threads) as worker:
                for result in worker.map(processElement_parallel, elems):
                    if result:
                        documents.append(result)
        if documents:
            queries = Queries(dbname=dbname, host=host, port=port)
            queries.bulkInsert(documents=documents)
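
A minimal usage sketch for the loader above. The tweet tuples, their field layout, and the id values are illustrative assumptions rather than the project's actual schema; a local MongoDB instance on the default port is also assumed.

# Usage sketch (hypothetical data; assumes MongoDB is reachable on localhost:27017).
tweets = [
    ('9001', 'alice', '2016-03-01', 'some tweet text', 'EN', '0', '0'),
    ('9002', 'bob', '2016-03-01', 'another tweet', 'EN', '0', '0'),
]
# serialized=True processes the tweets one by one in the current process;
# serialized=False distributes processElement_parallel over one worker per CPU core.
populateDatabase(tweets, language='EN', dbname='TwitterDB', serialized=True)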
Example #4
def main(filename,
         csv_delimiter='\t',
         header=True,
         dbname='TwitterDB',
         host='localhost',
         port=27017,
         language='EN',
         initialize=0,
         mode=0,
         serialized=False):
    # print mode, serialized, header
    # initialize everything from the start
    if initialize == 0:
        queries = Queries(dbname=dbname, host=host, port=port)
        queries.dropDocuments()
    populateDB(filename,
               csv_delimiter,
               header,
               language,
               dbname=dbname,
               host=host,
               port=port,
               mode=mode,
               serialized=serialized)
    constructIndexes(dbname, host, port)
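
A sketch of how this driver might be called. The file name is hypothetical; per the code above, initialize=0 first drops the existing documents collection, then the file is loaded and the vocabulary and named-entity indexes are rebuilt.

# Usage sketch (hypothetical TSV file; assumes a running MongoDB instance).
main('tweets.tsv', csv_delimiter='\t', header=True,
     dbname='TwitterDB', host='localhost', port=27017,
     language='EN', initialize=0, mode=0, serialized=False)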
Example #5
def main(filename, csv_delimiter='\t', header=True, dbname='TwitterDB', host='localhost', port=27017, language='EN', initialize=0, mode=0, serialized=False):
    # print mode, serialized, header
    # initialize everything from the start
    if initialize == 0:
        queries = Queries(dbname=dbname, host=host, port=port)
        queries.dropDocuments()
    populateDB(filename, csv_delimiter, header, language, dbname=dbname, host=host, port=port, mode=mode, serialized=serialized)
    constructIndexes(dbname, host, port)
Example #6
def constructIndexes(dbname, host, port):
    #build Vocabulary
    queries = Queries(dbname=dbname, host=host, port=port)
    start = time.time()
    queries.constructVocabulary()
    end = time.time()
    print "vocabulary_build.append(", (end - start) , ")"

    # built the NE Index
    start = time.time()
    queries.constructNamedEntities()
    end = time.time()
    print "ne_build.append(", (end - start) , ")"
Example #7
def populateDatabase(elems, dbname='OLAPDB', mode=0):
    global global_mode
    global_mode = mode
    if elems:
        documents = []
        no_threads = cpu_count()
        with ProcessPoolExecutor(max_workers=no_threads) as worker:
            for result in worker.map(processElement, elems):
                if result:
                    documents.append(result)
        if documents:
            queries = Queries(dbname=dbname)
            queries.bulkInsert(documents=documents)
Example #8
class Search:
    def __init__(self, searchPhrase, dbname='TwitterDB', host='localhost', port=27017, query=None, k=0):
        self.queries = Queries(dbname=dbname, host=host, port=port)
        self.words = [word.split('/')[0] for word in lemmatize(cleanText.removeStopWords(cleanText.cleanText(searchPhrase)[0]))]
        self.idfs = dict()
        and_list = []
        if self.words:
            for word in self.words:
                and_list.append({'words.word': word})
            self.query_search = {"$and" : and_list}
            if query:
                self.existing = True
                self.query_search.update(query)
            else:
                self.existing = False
            self.k = k


    def results(self):
        list_documents = []
        if self.words:
            # get ids for the search words
            idf = self.queries.getWords(query={'word': {"$in": self.words}}, fields={'word': 1, 'IDF': 1}, existing=self.existing)
            for word in idf:
                self.idfs[word['word']] = word['IDF']
            # get documents
            fields = {'_id': 1, 'author': 1, 'date': 1, 'rawText': 1, 'words.word': 1, 'words.tf': 1}
            documents = self.queries.getDocuments(query=self.query_search, fields=fields)

            # serial version
            for doc in documents:
                list_documents.append(self.process(doc))
        return list_documents

    def process(self, elem):
        document = dict()
        document['id'] = elem['_id']
        document['rawText'] = elem['rawText']
        document['author'] = elem['author']
        document['date'] = elem['date']
        score = 0
        for word in self.idfs:
            for tfs in elem['words']:
                if tfs.get(word, -1) == -1:
                    tf = tfs['tf']
                    score += tf*self.idfs[word]
        document['score'] = math.log(1+score, 2)
        return document
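
A short usage sketch for the class above. The search phrase is illustrative, and a TwitterDB that has already been populated and indexed (so that per-word IDF values exist) is assumed.

# Usage sketch (illustrative phrase; assumes the documents and vocabulary
# collections were built beforehand).
search = Search('climate change', dbname='TwitterDB', host='localhost', port=27017)
for doc in sorted(search.results(), key=lambda d: d['score'], reverse=True)[:10]:
    print doc['score'], doc['author'], doc['rawText']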
Example #9
 def __init__(self, searchPhrase, dbname='TwitterDB', query=None, k=0):
     self.queries = Queries(dbname)
     self.words = [word.split('/')[0] for word in lemmatize(cleanText.removeStopWords(cleanText.cleanText(searchPhrase)[0]))]
     self.idfs = dict()
     and_list = []
     for word in self.words:
         and_list.append({'words.word': word})
     self.query_search = {"$and" : and_list}
     if query:
         self.existing = True
         self.query_search.update(query)
     else:
         self.existing = False
     self.k = k
Example #10
def constructIndexes(dbname, host, port):
    #build Vocabulary
    queries = Queries(dbname=dbname, host=host, port=port)
    start = time.time()
    queries.constructVocabulary()
    end = time.time()
    print "vocabulary_build.append(", (end - start), ")"

    # build the NE Index
    start = time.time()
    queries.constructNamedEntities()
    end = time.time()
    print "ne_build.append(", (end - start), ")"
Example #11
class Search:
    def __init__(self, searchPhrase, dbname='TwitterDB', query=None, k=0):
        self.queries = Queries(dbname)
        self.words = [word.split('/')[0] for word in lemmatize(cleanText.removeStopWords(cleanText.cleanText(searchPhrase)[0]))]
        self.idfs = dict()
        and_list = []
        for word in self.words:
            and_list.append({'words.word': word})
        self.query_search = {"$and" : and_list}
        if query:
            self.existing = True
            self.query_search.update(query)
        else:
            self.existing = False
        self.k = k


    def results(self):
        # get ids for the search words
        idf = self.queries.getWords(query={'word': {"$in": self.words}}, fields={'word': 1, 'IDF': 1}, existing=self.existing)
        for word in idf:
            self.idfs[word['word']] = word['IDF']
        # get documents
        fields = {'_id': 1, 'author': 1, 'date': 1, 'rawText': 1, 'words.word': 1, 'words.tf': 1}
        documents = self.queries.getDocuments(query=self.query_search, fields=fields)
        list_documents = []
        # parallelized version -  doesn't work yet
        # no_threads = cpu_count()
        # with ProcessPoolExecutor(max_workers=no_threads) as worker:
        #         for result in worker.map(self.process, documents):
        #             if result:
        #                 list_documents.append(result)

        # serial version
        for doc in documents:
            document = dict()
            document['id'] = doc['_id']
            document['rawText'] = doc['rawText']
            document['author'] = doc['author']
            document['date'] = doc['date']
            score = 0
            for word in self.idfs:
                tf = 1
                for tfs in doc['words']:
                    if tfs.get(word, 1) == 1:
                        tf = tfs['tf']
                score += tf*self.idfs[word]
            document['score'] = math.log(1+score, 2)
            list_documents.append(document)

        return list_documents

    def process(self, elem):
        document = dict()
        document['id'] = elem['_id']
        document['rawText'] = elem['rawText']
        document['author'] = elem['author']
        document['date'] = elem['date']
        score = 0
        for word in self.idfs:
            tf = 1
            for tfs in elem['words']:
                if tfs.get(word, 1) == 1:
                    tf = tfs['tf']
            score += tf*self.idfs[word]
        document['score'] = math.log(1+score, 2)
        return document
Example #12
from mabed.mabed_files import MabedFiles
import subprocess
import os, shutil
from functools import wraps
import threading
import pickle
from streaming.stream import Streaming
import datetime
from indexing.queries import Queries
from flask import Flask

# Connecting to the database

dbname = 'TwitterDB_demo'
host = 'localhost'
port = 27017
queries = Queries(dbname=dbname, host=host, port=port)
can_collect_tweets = False
lda_running = False
mabed_running = False

app = Flask(__name__)

query = {}

query_pretty = ""


def check_auth(username, password):
    return username == 'demo' and password == 'ilikecats'
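
check_auth only compares credentials; in a Flask app it is normally paired with a decorator that sends the HTTP Basic Auth challenge. The sketch below is a hypothetical completion using the standard Flask pattern (requires_auth and the extra flask imports are not taken from the original fragment).

# Hypothetical completion: standard HTTP Basic Auth decorator built on check_auth.
from flask import request, Response

def requires_auth(f):
    @wraps(f)
    def decorated(*args, **kwargs):
        auth = request.authorization
        if not auth or not check_auth(auth.username, auth.password):
            # no or wrong credentials: ask the client to authenticate
            return Response('Login required.', 401,
                            {'WWW-Authenticate': 'Basic realm="Login Required"'})
        return f(*args, **kwargs)
    return decorated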

Example #13
class Search:
    def __init__(self,
                 searchPhrase,
                 dbname='TwitterDB',
                 host='localhost',
                 port=27017,
                 query=None,
                 k=0):
        self.queries = Queries(dbname=dbname, host=host, port=port)
        self.words = [
            word.split('/')[0] for word in lemmatize(
                cleanText.removeStopWords(
                    cleanText.cleanText(searchPhrase)[0]))
        ]
        self.idfs = dict()
        and_list = []
        if self.words:
            for word in self.words:
                and_list.append({'words.word': word})
            self.query_search = {"$and": and_list}
            if query:
                self.existing = True
                self.query_search.update(query)
            else:
                self.existing = False
            self.k = k

    def results(self):
        list_documents = []
        if self.words:
            # get ids for the search words
            idf = self.queries.getWords(query={'word': {
                "$in": self.words
            }},
                                        fields={
                                            'word': 1,
                                            'IDF': 1
                                        },
                                        existing=self.existing)
            for word in idf:
                self.idfs[word['word']] = word['IDF']
            # get documents
            fields = {
                '_id': 1,
                'author': 1,
                'date': 1,
                'rawText': 1,
                'words.word': 1,
                'words.tf': 1
            }
            documents = self.queries.getDocuments(query=self.query_search,
                                                  fields=fields)

            # serial version
            for doc in documents:
                list_documents.append(self.process(doc))
        return list_documents

    def process(self, elem):
        document = dict()
        document['id'] = elem['_id']
        document['rawText'] = elem['rawText']
        document['author'] = elem['author']
        document['date'] = elem['date']
        score = 0
        for word in self.idfs:
            for tfs in elem['words']:
                if tfs.get(word, -1) == -1:
                    tf = tfs['tf']
                    score += tf * self.idfs[word]
        document['score'] = math.log(1 + score, 2)
        return document