Example #1
class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):
  def __init__(self, featureName, featuresData, featureId):
    FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))
    allWords = set()
    if self.featureName == 'Basic: Tagline':
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split(','))))
    else:
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split())))
    self.words = sorted(list(filter(None, allWords - self.stopList)))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def getFeatureNames(self):
    return [self.featureName + ': ' + word for word in self.words]
  
  def process(self, v):
    vWords = set(map(lambda w: self.preprocess(w), filter(None, v.split(','))))
    return [(word in vWords) for word in self.words]
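A minimal, self-contained sketch of the boolean encoding that process() performs above: each value is reduced to a set of tokens and tested against a fixed, sorted vocabulary (the vocabulary and input below are made up; no FeatureBooleanizer base class or stemming is needed for the illustration):

# Hypothetical vocabulary of already-preprocessed words, as self.words would hold.
vocabulary = ['action', 'comedi', 'drama']
value = 'Drama,Action'

tokens = {w.strip().lower() for w in value.split(',') if w}
encoded = [(word in tokens) for word in vocabulary]
print(encoded)  # [True, False, True]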
Example #2
def stemmer(tokens):
    # ps = PorterStemmer()
    # tokens = [ps.stem(w) for w in tokens]

    ps = Stemmer('porter')
    tokens = [ps.stemWord(w) for w in tokens]
    return tokens
Example #3
def run():
    stemmer = Stemmer("english")
    pages = db.en.find()
    print colored.yellow("statistic words") 
    wordstatistic = {}
    for page in progress.bar(pages,size=db.en.count()):
        data = page.get("data")
        if not data:continue
        content = data.get("content")
        if not content:
            db.en.remove({"_id":page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w=stemmer.stemWord(word.strip()).lower()
            if w and len(w)<20 and not w in EN_IGNORE:
                if wordstatistic.get(w):
                    wordstatistic[w]+=1
                else:
                    wordstatistic[w]=1

    
    print colored.yellow("save to en_words_freq")
    savequene = []
    for k,v in progress.bar(wordstatistic.iteritems(),size=len(wordstatistic)):
        savequene.append({"_id":k,"freq":v})
        if len(savequene) >=1000:
            db.en_words_freq.insert(savequene)
            savequene=[]
        
    if savequene:db.en_words_freq.insert(savequene)
    print colored.cyan(
            "count of en_words_freq: %d" % db.en_words_freq.count())
Example #4
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words)

    Uses the porter stemmer algorithm.
    """
    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::

            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
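The docstring above says the class is a thin wrapper around the pystemmer package; a minimal sketch of the two underlying calls that do_process() dispatches to, assuming PyStemmer is installed and imported as the Stemmer module:

import Stemmer

stemmer = Stemmer.Stemmer('english')
print(stemmer.stemWord('running'))              # single word  -> 'run'
print(stemmer.stemWords(['running', 'flies']))  # list of words -> ['run', 'fli']
print(Stemmer.algorithms())                     # supported language names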
Example #5
class BagOfWordsFeatureSupport(FeatureSupport):
  def __init__(self, featuresData, featureId):
    FeatureSupport.__init__(self, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def extract(self, i):
    bag = frozenset(map(lambda w: self.preprocess(w), filter(None, self[i].split())))
    ret = bag - self.stopList
    if len(ret) == 0: ret = frozenset([''.join(random.choice('abcdefghjiklmnopqrstuvwxyz') for _ in range(20))])
    return ret
  
  def similarity(self, a, b):
    num = len(a & b)
    den = len(a | b)
    return num / den if den != 0 else 1.0
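similarity() above is simply the Jaccard index of two stem sets; a small self-contained illustration with made-up bags:

a = frozenset({'action', 'space', 'hero'})
b = frozenset({'space', 'hero', 'alien'})

num = len(a & b)  # size of the intersection: 2
den = len(a | b)  # size of the union: 4
print(num / den if den != 0 else 1.0)  # 0.5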
Example #6
def getStems(cleanedText, stopWords):
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    #maxlength = sum(1 for _ in matches1)
    #stemmer.maxCacheSize = maxlength
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        #position = match.start()
        position += 1 
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken and filteredToken is not None:
            wordStem = stemmer.stemWord(filteredToken.lower())
            #present = wordStem in stems
            if wordStem not in stems:
                #tokenid += 1
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    stemid = termDict[wordStem] 
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
Example #7
def tokenise(value, identifier, category, content_stop):
    token_list = []
    final_list = []
    value = re.sub(exclude1, " ", value)
    value = re.sub(exclude2, " ", value)
    value = re.sub(r'[^a-zA-Z]', " ", value)
    value = value.lower()
    if category == 'e':
        value = re.sub(r'(http|www|com)', " ", value)
    if category == 'c':
        value = re.sub(r'category', " ", value)
    token_list = value.split()
    for w in token_list:
        if w not in content_stop.keys():
            final_list.append(w)
#    stemmer = PorterStemmer()
    stemmer = Stemmer("english")
    final_list = [stemmer.stemWord(key) for key in final_list]
    #    final_list = [stemmer.stem(plural,0, len(plural)-1) for plural in final_list]
    if final_list:
        #call next function here.
        return (final_list)
    ####after work of token_list is done####
    token_list = []
    final_list = []
Example #8
def textHandler(text):
    #print(text)
    #stop_word = {}
    #tokenizing
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)
    #tokens = nltk.word_tokenize(text)#tokenizing
    #stop word removal
    #uwords = [word for word in tokens if word not in stop_word.keys()]#stop word removal
    #print('remove',uwords)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if not w in stop_words]
    # filter_sentence = []
    # for w in word_tokens:
    #     if w not in stop_words:
    #         filter_sentence.append(w)

    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))
    #print(filter_sentence)
    # print('before',len(filter_sentence))
    # print('after',len(stemming(stem_text)))
    return stem_text
Example #9
def stem(datalist):  #Stemming
    stemmer = Stemmer("english")
    tmp = []
    for x in datalist:
        y = stemmer.stemWord(x)
        tmp.append(y)
    return tmp
Example #10
            class Stemmer(object):
                def __init__(self):
                    # type: () -> None
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    # type: (unicode) -> unicode
                    return self.stemmer.stemWord(word)
Example #11
            class Stemmer(object):
                def __init__(self):
                    # type: () -> None
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    # type: (unicode) -> unicode
                    return self.stemmer.stemWord(word)
Example #12
def apply_snowball_stemmer(tagged_sentences: List[List[Tuple]], stemmer: Stemmer) -> List[List[Tuple]]:
    stemmed_sentences = []
    for sentence in tagged_sentences:
        stemmed = []
        for pos, text_repr, surface_repr, tag in sentence:
            word = surface_repr.lower() if tag in [CONTENT_WORD_TAG, STOPWORD_TAG] else surface_repr
            stemmed.append((pos, text_repr, surface_repr, stemmer.stemWord(word), tag))
        stemmed_sentences.append(stemmed)
    return stemmed_sentences
Example #13
def make_index(expression):
    """
    Standardize the expression and return a tuple that maximises
    matching possibilities.
    expression must be a list or tuple.
    """
    stemmer = Stemmer("french")
    expression = [stemmer.stemWord(normalize_token(w)) for w in expression]
    expression.sort()
    return tuple(expression)
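A hedged usage sketch of the idea behind make_index(): stemming plus sorting makes the resulting key insensitive to word order and inflection. normalize_token is project-specific, so a plain strip/lower placeholder stands in for it here; PyStemmer must be installed:

import Stemmer

def make_index_sketch(expression):
    stemmer = Stemmer.Stemmer('french')
    normalize = lambda w: w.strip().lower()  # hypothetical stand-in for normalize_token
    return tuple(sorted(stemmer.stemWord(normalize(w)) for w in expression))

# The same words in a different order (and case) give the same index key.
print(make_index_sketch(['Grandes', 'Maisons']) == make_index_sketch(['maisons', 'grandes']))  # True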
Example #14
def processQueries(queries):
    queryList = []
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery and filteredQuery is not None:
            stemmer = Stemmer('english')
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)
    
    return queryList
Example #15
def processQueries(queries):
    queryList = []
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery and filteredQuery is not None:
            stemmer = Stemmer('english')
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)

    return queryList
Example #16
    def parse_html(html):
        words = dehtml(html)

        s = Stemmer("danish")

        result = []
        for w in words.split():
            word = w.lower()
            if word in stop_words or len(word) < 2 or word.count('\\'):
                continue

            result.append(s.stemWord(word))
        return result
Example #17
    def parse_html(html):
        words = dehtml(html)

        s = Stemmer("danish")

        result = []
        for w in words.split():
            word = w.lower()
            if word in stop_words or len(word) < 2 or word.count('\\'):
                continue

            result.append(s.stemWord(word))
        return result
Example #18
def getTerm(term):
    term_ids = {}
    term_ids_file = open(TERMIDSFILE, 'rU')

    for line in term_ids_file.readlines():
        pieces = line.strip().split('\t')
        stemmer = Stemmer('english')
        #stemmer.maxCacheSize = 1
        termStem = stemmer.stemWord(term.lower())
        if termStem == pieces[1]:
            term_ids[pieces[1]] = int(pieces[0])
            return term_ids

    term_ids_file.close()
    return term_ids
Example #19
def getTerm(term):
    term_ids = {}
    term_ids_file = open(TERMIDSFILE, 'rU')
    
    for line in term_ids_file.readlines():
        pieces = line.strip().split('\t')
        stemmer = Stemmer('english')
        #stemmer.maxCacheSize = 1
        termStem = stemmer.stemWord(term.lower())
        if termStem == pieces[1]:
            term_ids[pieces[1]] = int(pieces[0])
            return term_ids
    
    term_ids_file.close()
    return term_ids
Example #20
def cleanQuery(data):
    global StopWords, Stemmer, extension
    data = data.lower()
    data = re.sub(r'<(.*?)>', '', data)  # Remove HTML Tags
    data = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        data,
        flags=re.MULTILINE)  # Remove Url
    data = re.sub('[^A-Za-z0-9]+', ' ', data)  # Remove Special characters
    token_list = word_tokenize(data)  # Tokenize String
    token_list = [
        word for word in token_list
        if word not in StopWords and word not in extension
    ]  # Remove StopWords and Extended StopWords
    tokens_list = [Stemmer.stemWord(word) for word in token_list]
    return tokens_list
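A simplified, self-contained sketch of the same query-cleaning pipeline (lowercase, strip HTML tags, URLs and special characters, tokenize, drop stop words, stem). It substitutes a plain split() and a tiny hypothetical stop set for NLTK's word_tokenize and the module-level StopWords/extension globals, and uses a simpler URL pattern:

import re
import Stemmer

STOP_WORDS = {'the', 'of', 'and'}  # hypothetical stand-in for StopWords + extension

def clean_query_sketch(data):
    data = data.lower()
    data = re.sub(r'<(.*?)>', '', data)         # remove HTML tags
    data = re.sub(r'http[s]?://\S+', '', data)  # remove URLs (simplified pattern)
    data = re.sub('[^A-Za-z0-9]+', ' ', data)   # remove special characters
    stemmer = Stemmer.Stemmer('english')
    return [stemmer.stemWord(w) for w in data.split() if w not in STOP_WORDS]

print(clean_query_sketch('<b>The history of</b> running: http://example.com'))  # ['histori', 'run']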
Example #21
class TextEater(object):
    
    def __init__(self):
        self.stoplist = gen_stops()
        self.stemmer = Stemmer('english')
    
    @coroutine
    def sent_filter(self,target):
        word = ''
        print "ready to eat lines"
        while True:
            sentence = (yield)
            target.send((sentence.lower()).split())

    @coroutine
    def word_filter(self, target):
        print "ready to eat words"
        while True:
            raw = (yield)
            target.send([self.stemmer.stemWord(w) for w in raw if len(w)<=3 or 
                    w in self.stoplist])


    @coroutine
    def ngrams(self,container, n=2,):
        "Compute n-grams" 
        while True:
            grams= (yield)
            for i in range(0, len((grams)) - (n - 1)):
                container[(tuple(grams[i:i+n]))]+=1
               
    @coroutine
    def printer(self):
        while True:
            line = (yield)
            print (line)

    @coroutine
    def typer(self,target):
        print "ready to check type"
        word = None
        while True:
            line = (yield word)
            word=  type(line)
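The ngrams coroutine above just counts sliding windows of length n; the same counting logic without the coroutine plumbing, as a plain self-contained function:

from collections import defaultdict

def count_ngrams(tokens, n=2):
    container = defaultdict(int)
    for i in range(0, len(tokens) - (n - 1)):
        container[tuple(tokens[i:i + n])] += 1
    return container

print(dict(count_ngrams(['to', 'be', 'or', 'not', 'to', 'be'])))
# {('to', 'be'): 2, ('be', 'or'): 1, ('or', 'not'): 1, ('not', 'to'): 1}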
Example #22
def cleanData(data):
    global StopWords, Stemmer, total_count
    data = re.sub(r'<(.*?)>', '', data)  # Remove HTML Tags
    data = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        data,
        flags=re.MULTILINE)  #Remove Url
    data = re.sub('[^A-Za-z0-9]+', ' ',
                  data)  # Remove Punctuation and Special Characters
    token_list = word_tokenize(data)  # Tokenize String
    total_count += len(token_list)
    token_list = [
        word for word in token_list
        if word not in StopWords and word not in extension
    ]  #Remove Stopwords and Extended Stopwords
    token_list = [Stemmer.stemWord(word) for word in token_list]  #Stem words

    return token_list
Example #23
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages == None:
        accepted_languages = [x.strip() for x in
                              registry.settings["accepted_languages"].split(","
                              )]
    if langs == None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update([stemmer.stemWord(x.value) for x in
                             tokenize(text)])
    return indexed_words
Example #24
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages == None:
        accepted_languages = [
            x.strip()
            for x in registry.settings["accepted_languages"].split(",")
        ]
    if langs == None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update(
            [stemmer.stemWord(x.value) for x in tokenize(text)])
    return indexed_words
Example #25
class Overview(Feature):
  description = """
Basic: Overview
""".strip()

  def __init__(self, *args, **kwargs):
    Feature.__init__(self)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    self.stopList = frozenset(['a', 'abaft', 'aboard', 'about', 'abov', 'absent', 'accord', 'account', 'across', 'addit', 'afor', 'after', 'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi', 'also', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'anenst', 'ani', 'anoth', 'anybodi', 'anyhow', 'anyon', 'anyth', 'anywher', 'apart', 'apr', 'april', 'apropo', 'apud', 'are', 'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop', 'aug', 'august', 'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor', 'begin', 'behalf', 'behest', 'behind', 'below', 'beneath', 'besid', 'best', 'better', 'between', 'beyond', 'big', 'bigger', 'biggest', 'billion', 'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call', 'can', 'cannot', 'cant', 'case', 'circa', 'close', 'concern', 'could', 'couldt', 'current', 'daili', 'day', 'dec', 'decemb', 'despit', 'did', 'do', 'doe', 'doesnt', 'done', 'dont', 'down', 'due', 'dure', 'each', 'eight', 'eighteen', 'eighth', 'eighti', 'eleven', 'end', 'enough', 'ever', 'except', 'exclud', 'fail', 'far', 'feb', 'februari', 'few', 'fifth', 'first', 'five', 'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four', 'fourteen', 'fourth', 'fourti', 'fri', 'friday', 'from', 'front', 'full', 'further', 'get', 'given', 'go', 'gone', 'goot', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'he', 'her', 'here', 'herself', 'high', 'higher', 'hightst', 'himself', 'his', 'how', 'hunderd', 'i', 'if', 'in', 'includ', 'insid', 'instead', 'into', 'is', 'it', 'itself', 'jan', 'januari', 'jul', 'juli', 'jun', 'june', 'just', 'last', 'late', 'later', 'latest', 'left', 'lest', 'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest', 'made', 'make', 'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid', 'midst', 'might', 'milliard', 'million', 'mine', 'minus', 'mld', 'mln', 'modulo', 'mon', 'monday', 'month', 'more', 'most', 'mth', 'much', 'must', 'my', 'myself', 'near', 'need', 'neednt', 'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti', 'no', 'none', 'nor', 'not', 'notwithstand', 'nov', 'novemb', 'number', 'o', 'oct', 'octob', 'of', 'off', 'on', 'one', 'onli', 'onto', 'oppos', 'opposit', 'or', 'order', 'other', 'ought', 'our', 'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per', 'place', 'plus', 'point', 'previous', 'prior', 'pro', 'pursuant', 'put', 'qua', 'rather', 'recent', 'regard', 'regardless', 'respect', 'right', 'round', 'said', 'sake', 'same', 'san', 'sat', 'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen', 'sep', 'septemb', 'seven', 'seventeen', 'seventh', 'seventi', 'sever', 'shall', 'she', 'should', 'shouldnt', 'show', 'shown', 'sinc', 'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest', 'so', 'some', 'somebodi', 'somehow', 'someon', 'someth', 'somewher', 'soon', 'sooner', 'spite', 'start', 'still', 'subsequ', 'such', 'sun', 'sunday', 'take', 'taken', 'tell', 'ten', 'tenth', 'than', 'thank', 'that', 'the', 'their', 'them', 'themselv', 'there', 'these', 'they', 'third', 'thirteen', 'thirti', 'this', 'those', 'thousand', 'three', 'through', 'throughout', 'thru', 'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today', 'told', 'too', 'took', 'top', 'toward', 'tue', 'tuesday', 'twelv', 'twenti', 'two', 'under', 'underneath', 'unit', 'unlik', 'until', 'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice', 'view', 'virtu', 'vis', 'visavi', 'vs', 'was', 'we', 'wed', 'wednesday', 'week', 'well', 'went', 'were', 'what', 'when', 'where', 'whether', 'whi', 'which', 'while', 'who', 'whose', 'will', 'with', 'within', 
'without', 'wont', 'wors', 'worst', 'worth', 'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet', 'you', 'your', 'yourself', 'yourselv', 'yr'])
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def extract(self, m):
    t = m.overview
    return ','.join(sorted(list(set(filter(lambda w: len(w) > 0 and w not in self.stopList, map(self.preprocess, t.split()))))))
Example #26
def nonField_query(path, text, secondary_index_list):
    #print(1)
    text = text.lower()
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if not w in stop_words]
    # filter_sentence = []
    # for w in word_tokens:
    #     if w not in stop_words:
    #         filter_sentence.append(w)
    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))
    #print(word)
    result_list = []
    #print(stem_text)
    for word in stem_text:
        result_list.append(Posting(secondary_index_list, word, path))
    return result_list
Example #27
def getStems(cleanedText, stopWords):
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*',
                          cleanedText.strip(),
                          flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    #maxlength = sum(1 for _ in matches1)
    #stemmer.maxCacheSize = maxlength
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        #position = match.start()
        position += 1
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken and filteredToken is not None:
            wordStem = stemmer.stemWord(filteredToken.lower())
            #present = wordStem in stems
            if wordStem not in stems:
                #tokenid += 1
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    stemid = termDict[wordStem]
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
Example #28
File: en.py Project: th0/test2
            class Stemmer(object):
                def __init__(self):
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    return self.stemmer.stemWord(word)
Example #29
                        os.remove(pathOfFolder + f_name + str(i))
                    else:
                        listOfWords[i] = topOfFile[i].split(':')
                        if listOfWords[i][0] not in heap:
                            heapq.heappush(heap, listOfWords[i][0])
    writeIntoFile(tag_index, pathOfFolder, data, countFinalFile)


#############################################################################

if os.path.exists(os.path.join(absltPthCurrPrgrm, 'stopwords.txt')):
    with open(os.path.join(absltPthCurrPrgrm, 'stopwords.txt'), 'r') as file:
        words = file.read().split('\n')
        # stem the stop word
        for word in words:
            word = ps.stemWord(word)
            if word:
                stopwords[word] = 1
else:
    print("stopwords.txt does not exist in the directory")
    sys.exit()

documentcount = 0
###########################################################################
# parse the documents
for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
    tag_name = strip_tag_name(elem.tag)

    # finished extracting all the text in the page tag.
    if (tag_name == 'page') and (event == 'end'):
        documentcount += 1
Example #30
class StemCorpus(Corpus):
    def __init__(self):
        super().__init__()
        self.stemmer = Stemmer('russian')

    def __getstate__(self):
        return self.word_to_idx, self.idx_to_word

    def __setstate__(self, state):
        self.stemmer = Stemmer('russian')
        self.word_to_idx, self.idx_to_word = state

    def encode_word(self, word):
        stem_form = self.stemmer.stemWord(word.lower())
        return self.word_to_idx.get(stem_form, len(self.idx_to_word) - 1)

    def build(self, sentences, vocabulary_size=50000, log_every=100000):
        print('= Start building vocabulary')
        vocab = defaultdict(int)
        saved_sentences = []
        for i, s in enumerate(sentences, 1):
            line = s.lower().split()
            for tok in line:
                if tok in PUNKT_TAGS:
                    continue
                stem_form = self.stemmer.stemWord(tok.lower())
                vocab[stem_form] += 1
            if i % log_every == 0:
                print('--- Processed {} sentences'.format(i))
            saved_sentences.append(line)

        print('= Built vocabulary with size {}'.format(len(vocab)))
        if vocabulary_size < len(vocab):
            print('= Trim it to {}'.format(vocabulary_size))
        word_freq = list(
            map(itemgetter(0),
                sorted(vocab.items(), key=_freq_sorter, reverse=True)))
        word_freq = word_freq[:vocabulary_size]

        print('Top 10 most frequent words: {}'.format(', '.join(
            word_freq[:10])))
        print('Top 10 least frequent words: {}'.format(', '.join(
            word_freq[-10:])))

        print('= Building word to index mapping')
        if Tag.NUM not in word_freq:
            word_freq[-2] = Tag.NUM

        if Tag.ENG not in word_freq:
            word_freq[-1] = Tag.ENG

        assert Tag.EOS not in word_freq
        word_freq.append(Tag.EOS)

        assert Tag.UNK not in word_freq
        word_freq.append(Tag.UNK)

        self.idx_to_word.clear()
        self.word_to_idx.clear()
        for w in word_freq:
            self.word_to_idx[w] = len(self.idx_to_word)
            self.idx_to_word.append(w)

        print('= Built mappings')
        print('idx_to_word size = {}, word_to_idx size = {}'.format(
            len(self.idx_to_word), len(self.word_to_idx)))
Example #31
class Searcher:

    def __init__(self):
        self.lexicon = {}            #lexicon for assisting in search
        self.titles = {}                #document titles
        self.stop_words = {}
        self.stemmer = Stemmer("english")  # for stemming of words
        self.totalDocs = 127467                     # total counts of all pages found in our document ( please update this count according to your dataset)
        self.load()

# loading all the files and writing them to respective dictionaries
    def load(self):
        self.loadLexicon("Lexicon.txt")
        self.loadTitles("titles.txt")
        self.loadStopWords("Stop_words.txt")

    def loadLexicon(self, path):
        try:
            lexiconFile = open(path, 'r')
            for line in lexiconFile:
                x = line[:-1].split("-")
                self.lexicon[int(x[0])] = int(x[1])
        except:
            print("Error opening lexicon file")
            sys.exit(0)

    def loadTitles(self , path):
        try:
            titleFile = open(path, 'r',encoding="UTF-8")
            for line in titleFile:
                x= line[:-1].split("-")
                self.titles[int(x[0])] = x[1]
        except:
            print("Error opening titles file")
            sys.exit(0)

    def loadStopWords(self, path):
        try:
            stop_words_file = open(path, 'r')
            content = stop_words_file.read()
            content = re.split(",", content)
            for word in content:
                if word:
                    self.stop_words[word] = True
        except:
            print("Error opening stop words file")
            sys.exit(0)

    # method to intersect lists
    def intersectLists(self, lists):
        if len(lists) == 0:
            return []
        # start intersecting from the smaller list
        lists.sort(key=len)
        c = lists[0]
        for x in lists[1:]:
            c = list(set(c) & set(x))
        return c

    #method to get document titles for document ids
    def getDocTitles(self, docIds):
        docTitles = []
        for y in docIds:
            title = self.titles.get(y)
            if title != None:
                docTitles.append(title)
        return  docTitles

#method to process and organize raw hitlist from index
    def processRawHitlist(self, hitlists):
        parentArr=[]                            # master array to contain all categories of hits
        parentArr.append({})               # title hits dictionary
        parentArr.append({})               # subTitle hits dictionary
        parentArr.append({})                # category hits dictionary
        parentArr.append({})                # text hit dictionary

        # splitting the hitlist and recording hits in parentArr
        hitlists = hitlists[:-1].split("|")[1].split("/")
        for singleList in hitlists:
            singleDocumentList = singleList.split("-")
            docId = int(singleDocumentList[0])
            for smallerLists in  singleDocumentList[1].split(","):
                a = smallerLists.split(".")
                type = int(a[0])     # the category or type of hit(title, text,etc)
                pos = int(a[1])
                if docId not in parentArr[type]:
                    parentArr[type][docId] = [pos]
                else:
                    parentArr[type][docId].append(pos)
        return parentArr

# method to return final processed hitlist of a word
    def getHitlist(self, word):
        word = word.lower()
        wordId = zlib.crc32(word.encode("UTF-8"))                    # getting word id
        bWordId = wordId.to_bytes(4, byteorder="big", signed=False)
        self.word_file = bWordId[0]             # getting the file containing the word
        off = self.lexicon.get(wordId)          # getting word pointer in index
        if off != None:
            f = open("SortedIndex/" + str(bWordId[0]) + ".txt", "r")
            f.seek(off)
            y = f.readline()              # reading raw hitlist
            f.close()
            return self.processRawHitlist(y)

        else:
            return []


# method to return all docs that contain the given words without catering for proximity
    def getUnproximatedDocs(self, wordsList,type):

        docIds = []
        for arr in wordsList:
            if arr != []:
                arr = arr[type]
                docIds.append(arr.keys())
        docs = self.intersectLists(docIds)
        return docs

# method to return phrase results for a particular type of hit
    def getResultsForPhrase(self,wordsList, type):
        termDocsCount = 0
        docs = {}
        unproximatedDocs = self.getUnproximatedDocs(wordsList,type)
        # converting unproximated docs to proximated
        for docId in unproximatedDocs:
            proximityArr = []
            for i, arr in enumerate(wordsList):
                if arr != []:
                    arr = arr[type]                     #getting hitlist for a particular type of hits
                    poss = arr.get(docId)
                    proximityArr.append([pos - i for pos in poss])      # subtracting n from positions of a word in document to bring them on a common line
            t = self.intersectLists(proximityArr)                              #intersecting positions to find phrases in documents
            if (len(t) > 0):
                tf = len(t)                                                                        # term frequency is the length of the intersection result
                docs[docId] = tf                                                            #recording term frequency of each document
                termDocsCount +=1
        if type == 3:
            return self.rankDocs(docs,termDocsCount)                   # ranking the results
        else:
            return docs.keys()


    def rankDocs(self,docs,termDocsCount):
        y = docs
        #calculating the inverse document frequency
        if termDocsCount != 0:
            x = self.totalDocs/termDocsCount
            ifd = math.log2(x)

        # calculating tf-idf score for each document
            for x in docs.keys():
                docs[x] = docs[x]*ifd

        # returning sorted array based on tf-idf values of documents
            x = sorted(docs.items(), key=lambda kv: kv[1],reverse=True)
            y = [a[0] for a in x]

        return y



# append two arrays
    def appendResults(self,results,moreResults):

        for x in moreResults:
            if x not in results:
                results.append(x)
        return results


# get results for a single word query
    def getResultsForWord(self,wordHitlist,type):

        docs={}
        typeArr = wordHitlist[type]                              #getting hitlists for particular type of hit
        for doc in typeArr:
            docs[doc] = len(typeArr.get(doc))                # recording tf of documents relative to query term
        termDocsCount = len(typeArr)
        if type == 3 and termDocsCount != 0:                #ranking title,categories,subtitles has no significant benefits
            x = self.totalDocs / termDocsCount
            ifd = math.log2(x)
            for x in docs.keys():
                docs[x] = docs[x] * ifd                            #finding tf- idf scores
            x = sorted(docs.items(), key=lambda kv: kv[1], reverse=True)       # sorting by tf-idf scores
            y = [a[0] for a in x]
            return y
        else:
            return [x for x in docs.keys()]


# method to do single word query on words of a phrase query and return results with a certain order
    def  getMoreResults(self,singleWordResults , count):

        docs = []
        for type in range(4):
            a = []
            maxCount = -1
            for i in range(len(singleWordResults)):
                v = singleWordResults[i].get(type)
                if v == None:
                    continue
                if len(v) > maxCount:
                    maxCount = len(v)
                a.append(v)

            for j in range(maxCount):
                for i in range(len(a)):
                    if j < len(a[i]):
                        docs.append(a[i][j])
                        if len(docs) == count:
                            return docs



# method to do one word query
    def oneWordQuery(self,word,mode):
        hitlist = self.getHitlist(word)
        if hitlist == []:
            return {}
        else:
            if mode:             # either an atomic single word query or a single word query on terms of a phrase word query
                results = {}   #dictionary results for query on terms of phrase query to allow order
                titleDoc =   self.getResultsForWord(hitlist,0)
                results[0] = titleDoc # results for title hits

                subTitleDoc = self.getResultsForWord(hitlist,1)
                results[1] = subTitleDoc # results for sub title hits

                categoryDoc = self.getResultsForWord(hitlist, 2)
                results[2] = categoryDoc    # results for category hits

                textDoc = self.getResultsForWord(hitlist,3)
                results[3] = textDoc    #results for text hits

                return results

            else:
                results = []  # array results for query on an atomic single word
                titleDoc = self.getResultsForWord(hitlist, 0)
                results = self.appendResults(results,titleDoc)

                subTitleDoc = self.getResultsForWord(hitlist, 1)
                results = self.appendResults(results,subTitleDoc)

                categoryDoc = self.getResultsForWord(hitlist, 2)
                results = self.appendResults(results,categoryDoc)

                textDoc = self.getResultsForWord(hitlist, 3)
                results = self.appendResults(results,textDoc)

                return results






#method to do a phrase query
    def phraseQuery(self,words):

        results = []
        wordsList=[]
        for word in words:
            wordsList.append(self.getHitlist(word))

        titleDoc =   self.getResultsForPhrase(wordsList,0)   # title hit results
        results = self.appendResults(results ,titleDoc)
        subTitleDoc = self.getResultsForPhrase(wordsList,1) # subtitle hit results
        results = self.appendResults(results,subTitleDoc)

        categoryDoc = self.getResultsForPhrase(wordsList, 2) # category hit results
        results =self.appendResults(results,categoryDoc)

        textDoc = self.getResultsForPhrase(wordsList,3) # text hit results
        results =self.appendResults(results,textDoc)

# doing a query on terms of a phrase with limit of 300 more results
        if len(results) < 300:
            singleWordResults = []
            for word in words:
                singleWordResults.append(self.oneWordQuery(word,True))
            y = 50 - len(results)
            x = self.getMoreResults(singleWordResults,y)
            if x != None:
                results += x

        return results





# parent query method and classifier
    def doQuery(self,words):

        results = {}
        queryWord=[]
        words = words.strip().split(" ")
        for word in words:
            word = word.lower()
            word = self.stemmer.stemWord(word)
            if word not in self.stop_words:
                queryWord.append(word)

        if len(queryWord) == 0:
            return {}

        elif len(queryWord) > 1:
            docIds = self.phraseQuery(queryWord)
            for id in docIds:
                results[id] = self.titles.get(id)

        else:
            docIds = self.oneWordQuery(queryWord[0], False)
            for id in docIds:
                results[id] = self.titles.get(id)

        return results
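rankDocs() above weights each document's term frequency by log2(totalDocs / termDocsCount) and sorts by the resulting tf-idf score; a self-contained numeric sketch with made-up counts:

import math

total_docs = 127467          # the same constant Searcher.totalDocs uses
docs = {3: 5, 9: 2, 14: 7}   # docId -> term frequency in that document
term_docs_count = len(docs)  # number of documents containing the term

idf = math.log2(total_docs / term_docs_count)
scores = {doc_id: tf * idf for doc_id, tf in docs.items()}
ranked = [doc_id for doc_id, _ in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)]
print(ranked)  # [14, 3, 9]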
Example #32
def stemming(data):
	stemmer = Stemmer("english")
	stemmedData = [stemmer.stemWord(key) for key in data]
	return stemmedData
Example #33
print("Do u want to query ? (y/n) ")
c = raw_input()

while c[0] == 'y':
    print(" What is your phrase query ? ")
    words = raw_input().strip('\n').lower()
    clock_start = time.time()
    docs = defaultdict(float)
    #t is for title
    #p is for text
    #c is for category
    words = words.split(' ')
    for word in words:
        if (':' in word):
            word = word.split(':')
            word[1] = ps.stemWord(word[1])
            if (word[0] == 't'):
                query_with_tag(word[1], 0)
            elif (word[0] == 'p'):
                query_with_tag(word[1], 1)
            elif (word[0] == 'c'):
                query_with_tag(word[1], 2)
            else:
                query_without_tag(word[1])
        else:
            word = ps.stemWord(word)
            query_without_tag(word)
    relevance_ranking()
    print("Query time = " + str(time.time() - clock_start))
    print("Do u want to query ? (y/n) ")
    c = raw_input()
Example #34
index=0

for line in f:
	print index
	data=data+line[:-1]+". "
	count=count+1
	if count%20==0:
		words=[]
		nouns=[]
		for t in data.split():
			#print nltk.tag.str2tuple(t)[1]
			try:
				if nltk.tag.str2tuple(t)[1][0]=='N':
					#no_of_nouns=no_of_nouns+1
					#n_n[stem.stemWord(nltk.tag.str2tuple(t)[0].lower())]=1
					nouns.append(stem.stemWord(nltk.tag.str2tuple(t)[0].lower()))
			except:
				g=1
			words.append(nltk.tag.str2tuple(t)[0].lower())

		data=""
		train_set=[]
		train_labels=[]
		
		for i in range(0,len(words)-WINDOW):
			temp=[]
			for j in range(i,i+WINDOW):
				temp.append(words[j].lower())
			if stem.stemWord(words[i+WINDOW].lower()) not in nouns:
				temp=[]
				continue
Example #35
                    exit(0)
                else:
                    import pdb
                    pdb.set_trace()

            try:
                annlist = annotations['annotations']
                for ann in annlist:
                    start = ann['start']
                    end = ann['end']
                    if start == 0:
                        if end == len(name):
                            wiki_match, confidence = wikipedia_match(ann)
                        else:
                            title = ann['title']
                            name_words = set([stemmer.stemWord(word.lower())
                                              for word in name.split()
                                              if word not in stopwords
                                              ])
                            title_words = set([stemmer.stemWord(word.lower())
                                               for word in title.split()
                                               if word not in stopwords
                                               ])
                            if name_words == title_words:
                                wiki_match, confidence = wikipedia_match(ann)

            except Exception as e:
                import pdb
                pdb.set_trace()

            if wiki_match:
Example #36
for line in f:
    print correct, incorrect
    print index
    data = data + line[:-1] + ". "
    count = count + 1
    if count % 20 == 0:
        words = []
        nouns = []
        for t in data.split():
            #print nltk.tag.str2tuple(t)[1]
            try:
                if nltk.tag.str2tuple(t)[1][0] == 'N':
                    #no_of_nouns=no_of_nouns+1
                    #n_n[stem.stemWord(nltk.tag.str2tuple(t)[0].lower())]=1
                    nouns.append(
                        stem.stemWord(nltk.tag.str2tuple(t)[0].lower()))
            except:
                g = 1
            words.append(nltk.tag.str2tuple(t)[0].lower())

        data = ""
        train_set = []
        train_labels = []

        for i in range(0, len(words)):
            train_set.append(words[i].lower())
            #train_labels.append(stem.stemWord(words[i+WINDOW].lower()))
        model = Word2Vec(train_set, min_count=1)
        #print train_set
        sum_ele = 0
        max_sum = -10
Example #37
class stemming:
    def __init__(self, language='english'):
        self.stemmer = Stemmer(language)

    def __call__(self, content):
        return [self.stemmer.stemWord(word) for word in content]
Example #38
        new_string = document_titles.readline().strip()
        output += new_string + "\n"
    print(output)
    # with open(outTxtFlPth, 'a+') as f:
    # 	print(output, file=f)


###########################################################################

# make a list of all the stopwords.
if os.path.exists(os.path.join(absltPthCurrPrgrm, 'stopwords.txt')):
    with open(os.path.join(absltPthCurrPrgrm, 'stopwords.txt'), 'r') as file:
        words = file.read().split('\n')
        # stem the stop word
        for word in words:
            word = ps.stemWord(word)
            if word:
                stopwords[word] = 1
else:
    print("stopwords.txt does not exist in the directory")
    sys.exit()

for i in range(3):
    mapping[i] = index_term_mapping(i)

create_offset()

##########################################################################

# get queries into a list
# with open(qryTxtFlPth) as f:
Example #39
def stemmer(listofTokens):                                          #Stemming
  stemmer=Stemmer("english")
  stemmedWords=[ stemmer.stemWord(key) for key in listofTokens ]
  return stemmedWords
Example #40
        #getting references
        ref = find_between(text, "eferences==", "==") + find_between(
            text, "eferences ==", "==")
        text = text.replace(ref, '')

        #clearing up the dictionary, and working on each field
        article_dict = {}

        #TITLE
        field_tokens = []
        title = re.sub('[^A-Za-z]', ' ', title)
        chunk = nltk.word_tokenize(title.lower())
        stopped_tokens = [i for i in chunk if not i in stop_words]
        for i in stopped_tokens:
            try:
                field_tokens.append(p_stemmer.stemWord(i))
            except:
                field_tokens.append(i)
        for i in field_tokens:
            if i in article_dict.keys():
                freq = int(find_between(article_dict[i], "(", ")")) + 1
                if "T" in article_dict[i]:
                    article_dict[i] = find_between(article_dict[i], "",
                                                   "(") + "(%d)" % freq
                else:
                    article_dict[i] = "T" + find_between(
                        article_dict[i], "", "(") + "(%d)" % freq
            else:
                article_dict[i] = "T%d(1)" % count

        #BODY TEXT
Example #41
    'large',
    'database',

    'WSDM',
    'web',
    'search',
    'data',
    'mining',

    'WWW',
    'web'
]
stemmer = Stemmer('english')
new_keywords = set()
for keyword in keywords:
    new_keywords.add(stemmer.stemWord(keyword.lower()))
keywords = new_keywords

conferences = defaultdict(int)


def matches_keywords(title):
    title_keywords = set([stemmer.stemWord(w) for w in title.lower().split()])
    return len(title_keywords.intersection(keywords))


def matches_confs(conf):
    return any([conf.strip() in real_conf for real_conf in confs])

citations_found = 0
with open('arnetminer_full.txt') as f:
Example #42

TGrammar = Dict[str, RuleSet]


def validate_grammar(grammar: TGrammar):
    for ruleset in grammar.values():
        for rule in ruleset.rules:
            for t in rule.tokens:
                if isinstance(t, RefToken):
                    assert t.target in grammar, f'Invalid target {t.target} ' \
                                                f'in rule: \'{to_string(rule.tokens)}\' ' \
                                                f'in ruleset \'{ruleset.name}\''


tokenizer = Tokenizer(text_postprocessing_fn=lambda t: STEMMER.stemWord(t))


def tokenize(text: str, parameters: bool) -> tuple[Token]:
    return tokenizer.tokenize(text)


def grammar_from_dict(data: dict) -> TGrammar:
    grammar = {}
    for key, ruleset_raw in data.items():
        rules = []
        for raw_rule in ruleset_raw:
            if isinstance(raw_rule, str):
                rules.append(Rule(tokens=tokenize(raw_rule, parameters=True)))
            elif isinstance(raw_rule, dict):
                for k, v in raw_rule.items():
Example #43
						listofWords[i]=topofFile[i].split(':')
						if listofWords[i][0] not in heap:
							heapq.heappush(heap,listofWords[i][0])
		if count == 100000:
			print("100000exceeded")
			writeintofile(tagno,path,data)
			data = defaultdict(list)
			count = 0
	if count > 0 : 
		writeintofile(tagno,path,data)
		data = defaultdict(list)
				 
with open('stopwords.txt','r') as file :#reading the stopwords (words to be ignored)
	words = file.read().split('\n')#putting the stopwords into a list "words"
	for x in words:
		x = ps.stemWord(x)#stem the stopwords
		stopwords[x]=1

doccnt = 0	
docno = 0	
for event,element in ET.iterparse(XMLLOC,events=("start","end")):
	chop = element.tag
	idx = chop.rfind("}")#it is a namespace actually, and we use namespaces because for eg if we have a field ID both in
	#as well as teacher class then we use a namespace like a link in XML. But we need the end portion after that namespace
	if idx != -1 :
		chop = chop[idx+1:]
	if chop == 'page' and event == 'end':
		#this is the code for a particular page
		for w in wordsdict :
			for t in tags : 
				if cnt[tags[t]][w] > 0 :
Example #44
def stemmer(listofTokens):  #Stemming
    stemmer = Stemmer("english")
    stemmedWords = [stemmer.stemWord(key) for key in listofTokens]
    return stemmedWords
Example #45
class Searcher(object):
    """Run a search on documents or objects within documents
    in the SQLite table
    Three scoring options are available: Frequency, TF-IDF and BM25
    Two methods of incrementing the scores of results are available:
    simple addition or best score"""
    
    
    def __init__(self, query, db, doc_level_search=True, stemmer=False, path='/var/lib/philologic/databases/'):
        self.path = path + db + '/'
        self.words = query.split()
        self.doc_level_search = doc_level_search
        self.results = {}
        if doc_level_search:
             self.doc_path = self.path + 'doc_arrays/'
        else:
            self.doc_path = self.path + 'obj_arrays/'
        self.stemmer = stemmer
        if stemmer:
            try:
                from Stemmer import Stemmer
                self.stemmer = Stemmer(stemmer) # where stemmer is the language selected
                self.words = [self.stemmer.stemWord(word) for word in self.words]
            except KeyError:
                print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
            except ImportError:
                print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."            
        
    def get_hits(self, word, doc=True):
        """Query the SQLite table and return a list of tuples containing the results"""
        cursor = sqlite_conn(self.path + 'hits_per_word.sqlite')
        if self.doc_level_search:
            cursor.execute('select doc_id, word_freq, total_words from doc_hits where word=?', (word,))
        else:
            cursor.execute('select obj_id, word_freq, total_words from obj_hits where word=?', (word,))
        return cursor.fetchall()
        
    def id_to_word(self, id):
        """Return the word given its ID"""
        m = mapper(self.path)
        return m[id]
        
    def get_idf(self, hits):
        """Return IDF score"""
        total_docs = doc_counter(self.doc_path) #### WRONG COUNT
        try:
            return log(float(total_docs) / float(len(hits))) + 1
        except ZeroDivisionError:
            return 0
               
    def search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        if self.words != []:
            for word in self.words:
                hits = self.get_hits(word)
                getattr(self, measure)(hits, scoring)
                if intersect:
                    if self.intersect:
                        self.docs = self.docs.intersection(self.new_docs)
                        self.new_docs = set([])
                    else:
                        self.intersect = True
                        self.docs = set([obj_id for obj_id in self.results])
                        self.new_docs = set([])
            if intersect:
                self.results = dict([(obj_id, self.results[obj_id]) for obj_id in self.results if obj_id in self.docs])
            return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
        else:
            return []
    
    def debug_score(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            getattr(self, scoring)(obj_id, word_freq)
    
    def tf_idf(self, hits, scoring):
        idf = self.get_idf(hits)
        for obj_id, word_freq, word_sum in hits:
            tf = float(word_freq) / float(word_sum)
            score = tf * idf
            getattr(self, scoring)(obj_id, score)
                    
    def frequency(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            score = float(word_freq) / float(word_sum)
            getattr(self, scoring)(obj_id, score)
                    
    def bm25(self, hits, scoring, k1=1.2, b=0.75):
        ## a floor is applied to normalized length of doc
        ## in order to diminish the importance of small docs
        ## see http://xapian.org/docs/bm25.html
        idf = self.get_idf(hits)
        avg_dl = avg_doc_length(self.path)
        for obj_id, word_freq, obj_length in hits:
            tf = float(word_freq)
            dl = float(obj_length)
            temp_score = tf * (k1 + 1.0)
            temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
            score = idf * temp_score / temp_score2
            getattr(self, scoring)(obj_id, score)
                    
    def simple_scoring(self, obj_id, score):
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            self.results[obj_id] += score
    
    def dismax_scoring(self, obj_id, score):
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            if score > self.results[obj_id]:
                self.results[obj_id] = score
                
    def lda_search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        self.words = [words.decode('utf-8') for words in self.words]
        if self.words != []:
            lda_query = self.match_topic()
            if lda_query != None:
                for word in self.words[:1]:  # temporary slice, to offer it as an option?
                    lda_query[word] = sum([lda_query[term] for term in lda_query])
                print lda_query
                self.num_hits = {}
                for other_word, freq in lda_query.iteritems():
                    hits = self.get_hits(other_word)
                    results = self.lda_scoring(hits, scoring, freq, measure)
                self.results = dict([(obj_id, self.results[obj_id] * self.num_hits[obj_id]) for obj_id in self.results if self.num_hits[obj_id] > 1])
                return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
            else:
                return []
        else:
            return []
            
    def match_topic(self):
        topic_id = int
        cursor = sqlite_conn(self.path + 'lda_topics.sqlite')
        if len(self.words) == 1:
            cursor.execute('select topic, position from word_position where word=? order by position', (self.words[0],))
            try:
                topic_id = cursor.fetchone()[0]
            except TypeError:
                return None
        else:
            topic_pos = {}
            topic_matches = {}
            query = 'select topic, position from word_position where word="%s"' % self.words[0]
            for word in self.words[1:]:
                query += ' or word="%s"' % word
            cursor.execute(query)
            for topic, position in cursor.fetchall():
                if topic not in topic_pos:
                    topic_pos[topic] = position
                    topic_matches[topic] = 1
                else:
                    topic_pos[topic] += position
                    topic_matches[topic] += 1
            word_num = len(self.words)
            topics = [(topic, topic_pos[topic]) for topic in topic_pos if topic_matches[topic] == word_num]
            if topics == []:
                topics = [(topic, topic_pos[topic]) for topic in topic_pos if topic_matches[topic] == word_num - 1]
            topic_id = sorted(topics, key=itemgetter(1))[0][0]
        cursor.execute('select words from topics where topic=?', (topic_id,))
        results = json.loads(cursor.fetchone()[0])
        topic = [(term, float(freq)) for term, freq in results.iteritems()]# if float(freq) > 0.01]
        topic = dict(sorted(topic, key=itemgetter(1), reverse=True)[:10])
        return topic
        
    def lda_scoring(self, hits, scoring, freq, measure):
        if measure == 'tf_idf':
            idf = self.get_idf(hits)
            for obj_id, word_freq, word_sum in hits:
                tf = float(word_freq) / float(word_sum)
                score = tf * idf * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score    
                    self.num_hits[obj_id] += 1
        else:
            idf = self.get_idf(hits)
            avg_dl = avg_doc_length(self.path)
            k1 = 1.2
            b = 0.75
            for obj_id, word_freq, obj_length in hits:
                tf = float(word_freq)
                dl = float(obj_length)
                temp_score = tf * (k1 + 1.0)
                temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
                score = idf * temp_score / temp_score2 * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score    
                    self.num_hits[obj_id] += 1
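A self-contained numeric sketch of the BM25 variant used in bm25() above, with the floor() applied to the normalized document length as in the code (and the Xapian note it cites); all frequencies and lengths below are made up:

from math import floor, log

k1, b = 1.2, 0.75
total_docs, docs_with_term = 1000, 50
word_freq, obj_length, avg_dl = 3.0, 120.0, 100.0

idf = log(float(total_docs) / docs_with_term) + 1  # same formula as get_idf()
tf, dl = word_freq, obj_length
score = idf * (tf * (k1 + 1.0)) / (tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl)))
print(round(score, 3))  # ~6.279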
Example #46
    porter = PorterStemmer()
    snowball = SnowballStemmer("english")
    isri = ISRIStemmer()
    rslp = RSLPStemmer()
    porter2 = Stemmer('english')

    endOfString = StringEnd()
    prefix = oneOf(
        "uni inter intro de con com anti pre pro per an ab ad af ac at as re in im ex en em un dis over sub syn out thermo philo geo for fore back"
    )
    suffix = oneOf("ish")
    #suffix = oneOf("or er ed ish ian ary ation tion al ing ible able ate ly ment ism ous ness ent ic ive "
    #               "ative tude ence ance ise ant age cide ium ion")

    word = (Optional(prefix)("prefixes") +
            SkipTo(suffix | suffix + FollowedBy(endOfString)
                   | endOfString)("root") +
            ZeroOrMore(suffix | suffix + FollowedBy(endOfString))("suffix"))
    #word = (Optional(prefix)("prefixes") + SkipTo(FollowedBy(endOfString))("root"))

    for wd in wordlist:
        print wd
        stem = lanster.stem(wd)
        print "LansterStemmer:" + stem
        print "PorterStemmer2:" + porter2.stemWord(wd)
        #res = word.parseString(stem)
        #print res.dump()
        #print

finally:
    file.close()
def process_text(text, stemming=True):
    words = _tokenize(text)
    if not stemming:
        return words
    stemmer = Stemmer('english')
    return [stemmer.stemWord(word) for word in words]