Example #1
 def find_all(self):
     cursor = self.find()
     software = cursor.distinct('software_name')
     os = cursor.distinct('os_name')
     company = cursor.distinct('company_name')
     cursor.close()
     return flatten(flatten(software,os),company)
Example #2
def compare(matrix, possible_matrix_list):
    cost = []
    for possible_matrix in possible_matrix_list:
        matrix_cost = 0
        possible_matrix = flatten(possible_matrix)
        final_matrix = flatten(matrix)
        for i, j in zip(final_matrix, possible_matrix):
            if i == b:  # `b` is assumed to be a blank/placeholder value defined elsewhere
                pass
            elif i != j:
                matrix_cost += 1
        cost.append(matrix_cost)
    return cost
Example #3
def compare(matrix, possible_matrix):
    cost = []
    for possible in possible_matrix:
        mat_cost = 0
        possible = flatten(possible)
        matrix = flatten(matrix)
        for i, j in zip(matrix, possible):
            if i == b:  # `b` is assumed to be a blank/placeholder value defined elsewhere
                pass
            elif i != j:
                mat_cost += 1
        cost.append(mat_cost)
    return cost
Example #4
 def __iter__(self):
     n=len(self.uniblocks)
     indreorder=torch.randperm(n).tolist()
     batch=[]
     for idx in indreorder:
         indnew=list(range(idx*self.blocksize,(idx+1)*self.blocksize))
         batch.append(indnew)
         if len(batch)==self.nblock:  # number of blocks in each minibatch
             yield flatten(batch)
             batch=[]
     
     if len(batch) > 0:
         yield flatten(batch)
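The inline comment above describes minibatches built from whole contiguous index blocks; a standalone sketch of that pattern with illustrative values (blocksize, nblock, block count), assuming flatten behaves like nltk.flatten:

import torch
from nltk import flatten

blocksize, nblock = 3, 2            # illustrative values
n_blocks = 4                        # stands in for len(self.uniblocks)
order = torch.randperm(n_blocks).tolist()

batch = []
for idx in order:
    batch.append(list(range(idx * blocksize, (idx + 1) * blocksize)))
    if len(batch) == nblock:
        print(flatten(batch))       # one minibatch, e.g. [6, 7, 8, 0, 1, 2]
        batch = []
if batch:
    print(flatten(batch))           # leftover blocks, if any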
Example #5
def extract_zip_code(texts):
    '''
    Concatenate the addresses (with zip codes stripped) and the zip codes returned by zip_code_submethod.
    Args:
        texts (string): addresses, with or without zip codes, separated by a delimiter.

    Returns:
        concat_address (string): addresses joined with the delimiter
        concat_postal_code (list): flattened list of zip codes
    '''

    delimeter = '|'
    address_list, postal_code_list = list(), list()

    if texts is not None and str(texts) != 'nan' and texts != 'N/A':
        if delimeter in texts:
            for text in texts.split(delimeter):
                address_list_val, postal_code_val = zip_code_submethod(
                    Constants.zip_code_regex, text)
                address_list.append(address_list_val)
                postal_code_list.append(postal_code_val)
        else:
            address_list_val, postal_code_val = zip_code_submethod(
                Constants.zip_code_regex, texts)
            address_list.append(address_list_val)
            postal_code_list.append(postal_code_val)

        concat_address = delimeter.join(address_list)
        concat_postal_code = flatten(postal_code_list)

        return concat_address, concat_postal_code
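A standalone sketch of the final flatten step, assuming flatten here is nltk.flatten or an equivalent nested-list flattener (the zip codes are illustrative, not output of zip_code_submethod):

from nltk import flatten

postal_code_list = [['SW1A 1AA'], [], ['LS1 4AB', 'LS2 9JT']]
print(flatten(postal_code_list))  # ['SW1A 1AA', 'LS1 4AB', 'LS2 9JT']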
Example #6
def process_natural_text(body):
    toks_ = [[
        l.strip() for l in re.findall(CODE_TOKENISATION_REGEX, line.strip())
        if len(l.strip()) > 0
    ] for line in body.split('\n')]
    toks_ = flatten(toks_)
    return [(w, t, 0) for w, t in pos_tag(toks_, tagset='universal')]
Example #7
def getSortedCountMap(col, rx):
    backupColName = 'backup_' + col
    train[backupColName] = train[col].apply(lambda x: toJson(x, rx)
                                            if type(x) is str else [])
    sortedList = sorted(flatten(list(train[backupColName].values)))
    countMap = [(i, len(list(c))) for i, c in groupby(sortedList)]
    return sorted(countMap, key=lambda x: x[1], reverse=True)
Example #8
    def preprocess(self,text):   
        check = re.search(r'([a-z])\1+',text)
        if check:
            if len(check.group())>2:
                text = re.sub(r'([a-z])\1+', lambda m: m.group(1), text, flags=re.IGNORECASE)  # collapse elongated characters such as "hayyy", "ngonnnn", ...

        text = text.strip()  # strip leading/trailing whitespace
        text = text.lower()  # convert everything to lowercase

        text = re.sub('< a class.+</a>',' ',text)

        for k, v in self.replace_list.items():  # replace words found in replace_list
            text = text.replace(k, v)       

        text = re.sub(r'_',' ',text)  
        
        text = ' '.join(i for i in flatten(tokenize(text).split(" ")))  # segment compound words
        
        for i in self.Pos_list:  # add a "positive" feature marker
            if re.search(' '+i+' ',text): 
                text = re.sub(i,i+' positive ',text)
        for i in self.Neg_list:  # add a "negative" feature marker
            if re.search(' '+i+' ',text):
                text = re.sub(i,i+' negative ',text)
        return text
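The elongated-character rule above relies on a regex backreference; a standalone sketch of just that step (the sample string is illustrative):

import re

sample = "hayyy quá, ngonnnn!"
collapsed = re.sub(r'([a-z])\1+', lambda m: m.group(1), sample, flags=re.IGNORECASE)
print(collapsed)  # "hay quá, ngon!"
# note: this also collapses legitimate double letters, e.g. "good" -> "god"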
Example #9
def get_traindata(input_files):
    if (len(input_files) == 0):
        raise Exception('empty values')
    list_filedata = []
    list_filepaths = []
    #print(input_files)
    for eachfile in input_files:
        for filename in eachfile:
            filepaths = glob.glob(str(filename))
            #print(filepaths)
            list_filepaths.append(filepaths)

    filepaths_list = nltk.flatten(list_filepaths)
    filepaths_list = filepaths_list[:200]
    print('Text files in Train data', len(filepaths_list))
    #print(filepaths_list)
    for filepath in filepaths_list:
        fileopen = open(filepath)
        file_data = fileopen.read()
        file_data = file_data.replace("<br />", " ")
        #file_data = re.sub(r'[^a-zA-Z0-9]'," ",file_data)
        list_filedata.append(file_data)
        fileopen.close()
    #print(list_filedata[0:10])
    return list_filedata
Example #10
def Update_Output(inputfiles, files_data, outputpath):
    # print((outputpath))
    filenames =[]
    files = nltk.flatten(inputfiles)
    for i in range(len(files)):
        input_files = glob.glob(files[i])
        # print(input_files)
        for j in range(len(input_files)):
            # print(type(input_files[j]))
            if '.txt' in  input_files[j]:
                input_files[j] = input_files[j].replace(".txt", ".redacted.txt")
            if '.md' in input_files[j]:
                input_files[j] = input_files[j].replace(".md", ".redacted.txt")
            if '\\' in input_files[j]:
                input_files[j]= input_files[j].split("\\")
                input_files[j] = input_files[j][1]
                # print(input_files[j])
            filenames.append(input_files[j])

    for i in range(len(files_data)):
        for j in range(len(filenames)):
            if i==j:
                file_data =files_data[i]
                # print((filenames[i]))
                path1 = (os.getcwd())
                # print(outputpath+filenames[j])
                path2 = (outputpath+'/'+filenames[j])
                final_file = open(os.path.join(path1,path2), "w" ,encoding="utf-8")
                # print(os.path.join(path1,path2))
                final_file.write(file_data)
                final_file.close()
    return len(filenames)
Example #11
def get_partial_template(formId, count, key, value):
    s = ''
    for i in value:
        # print(f"{key} - {i}")
        if not i:
            pass
        elif len(i) == 1:  # single <p> paragraph
            count += 1
            s += f"\n\n<p>{get_message_with_affinity(formId, count)}</p>"
        elif isinstance(i[0], str) and i[0].endswith(':'):  # a <ul> list
            if len(i) == 2:
                result, count = get_list_template(i, formId, count, False)
                s += f"\n\n<p>{result}</p>"
            else:
                result, count = get_list_template(i[:2], formId, count, False)
                s += f"\n\n<p>{result}</p>"
                count, ts = get_partial_template(formId, count, key, [i[2:]])
                s += f"{ts}"
        elif i == flatten(i) and index_of_urls(
                i):  # paragraph <p> with <a> links
            count, ts = get_link_template(i, formId, count + 1,
                                          index_of_urls(i))
            if len(index_of_urls(i)) == 1:
                s += f"\n\n<p>@genericLink(params, \"{formId}\", {ts})</p>"
            else:
                s += f"\n\n{ts}"
        else:
            print(
                "ERROR : An unhandled variation was encountered when generating GUIDE PAGE TEMPLATE. This is most likely due to a missing html tag in guide page messages in dfs-frontend."
            )
    return count, s
Example #12
def get_hypernyms(synset):
    hypernyms = set()
    for hyponym in synset.hypernyms():
        hypernyms |= set(get_hypernyms(hyponym))
    result_syns = hypernyms | set(synset.hypernyms())
    result = set(flatten([list(x.lemmas()) if isinstance(x, Synset) else x for x in result_syns]))
    return result
Example #13
    def get_all_tokens(c):
        res = c.copy()
        children = nltk.flatten([token.children for token in c])

        if len(children) > 0:
            res += UDPipeKeywordsExtractor.get_all_tokens(children)

        return res
Example #14
def get_antonyms(synset):
    antonyms = set()
    new_antonyms = set()
    for lemma in synset.lemmas():
        new_antonyms |= set(lemma.antonyms())
        antonyms |= new_antonyms
        for antonym in new_antonyms:
            antonyms |= set(flatten([list(x.lemmas()) for x in antonym.synset().similar_tos()]))
    return antonyms
Example #15
    def get_all_tokens(x):
        res = x.copy()

        children = nltk.flatten([token.children for token in x])

        if len(children) > 0:
            res += UDPipeModel.get_all_tokens(children)

        return res
Example #16
def input(files):
    Read_data = []
    x = []
    for j in files:
        for file in j:
            data = glob.glob(str(file))
            x.append(data)
    y = nltk.flatten(x)
    for i in y:
        Read_data.append(open(i).read())
    return Read_data
Example #17
    def evaluate(self, X):
        """
        Evaluate the classifier.
        
        :param X:           data to test on
        :return:            evaluation results
        """
        features, labels = separate_labels_from_features(X)

        # get predictions for data
        y = self.predict(features)

        n_sent_correct = 0
        num_sent = len(y)

        for i in range(len(labels)):
            if labels[i] == y[i]:
                n_sent_correct += 1

        labels = nltk.flatten(labels)
        y = nltk.flatten(y)

        print("F1 score:")
        print(
            sklearn.metrics.precision_recall_fscore_support(labels,
                                                            y,
                                                            average='micro'))
        print()
        print("Accuracy:")
        print(sklearn.metrics.accuracy_score(labels, y))
        print()
        print("Sentence level accuracy:")
        print(n_sent_correct / num_sent)
        print()
        print("F1 score per class:")
        print(sklearn.metrics.precision_recall_fscore_support(labels, y))
        print()
        print("Confusion matrix:")
        cfm = sklearn.metrics.confusion_matrix(labels, y)

        plot_confusion_matrix(cfm, np.unique(labels))
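A small self-contained illustration of the two accuracy notions computed above, using toy label sequences and nltk.flatten for the token-level view:

import nltk
import sklearn.metrics

labels = [['B', 'I', 'O'], ['O', 'O']]
preds = [['B', 'O', 'O'], ['O', 'O']]

sentence_accuracy = sum(l == p for l, p in zip(labels, preds)) / len(labels)
token_accuracy = sklearn.metrics.accuracy_score(nltk.flatten(labels), nltk.flatten(preds))
print(sentence_accuracy, token_accuracy)  # 0.5 0.8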
Example #18
def tokenizeSentence(input):
    sentences = [
        re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl=' ',
               string=x).strip().split(' ') for x in input.split('\n')
        if not x.endswith('writes:')
    ]
    sentences = [x for x in sentences if x != ['']]
    flat_list = flatten(sentences)
    flat_list = [x for x in flat_list if x != '']
    stopwords_german = set(stopwords.words('german'))
    filtered_tokens = [w for w in flat_list if w not in stopwords_german]
    return filtered_tokens
Example #19
    def extract(self, keywords, threshold=3, sampler=Sampler.last.value):
        """
		Extracts the relevant phrases to use for comparison with an incoming query for computing veracity

		keywords (list)		- Query of keywords
		threshold (int)		- Controls number of sentences to return (-1 for all sentences)
		sampler(Sampler) 	- Returns first-n, last-n, random-n or best-n number of sentences (heuristic)

		return (list)		- List of phrases and sentences to use for veracity computation
		"""

        sentences = TextBlob(
            self.body).sentences  # Extract sentences from article body
        relevant = {}
        for sentence in sentences:
            sentence = str(sentence).decode('utf-8').lower().strip()
            tokens = word_tokenize(sentence)
            tokens = [t.lower() for t in tokens if t not in punctuation]

            common = len([value for value in keywords if value in tokens])
            if common == 0: continue
            if common not in relevant.keys(): relevant[common] = []
            relevant[common].append(' '.join(tokens))

        if threshold == -1: return flatten(relevant.values())

        if sampler == Sampler.last.value:
            return flatten(relevant.values())[-threshold:]
        elif sampler == Sampler.first.value:
            return flatten(relevant.values())[:threshold]
        elif sampler == Sampler.random.value:
            sampled = flatten(list(relevant.values()))
            shuffle(sampled)
            return sampled[-threshold:]
        elif sampler == Sampler.best.value:
            result = []
            for key in sorted(relevant.keys(),
                              reverse=True):  # Sort by keyword hits
                result.extend(relevant[key])
                if len(result) >= threshold: break
            return result[:threshold]
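A standalone sketch of the bucket-then-sample logic described in the docstring, assuming flatten behaves like nltk.flatten (the bucket contents are illustrative):

from nltk import flatten

relevant = {1: ['one keyword hit'], 3: ['three keyword hits'], 2: ['two keyword hits']}
threshold = 2

# "last-n" sampling over the flattened buckets
print(flatten(list(relevant.values()))[-threshold:])

# "best-n": walk buckets from most keyword hits downwards
result = []
for key in sorted(relevant, reverse=True):
    result.extend(relevant[key])
    if len(result) >= threshold:
        break
print(result[:threshold])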
Example #20
def get_pertainyms(synset):
    pertainyms = set()
    new_pertainyms = set()
    for lemma in synset.lemmas():
        new_pertainyms |= set(lemma.pertainyms())
        pertainyms |= new_pertainyms
        for pertainym in new_pertainyms:
            pertainyms |= set(
                flatten([
                    list(x.pertainyms())
                    for x in pertainym.synset().similar_tos()
                ]))
    return pertainyms
Example #21
def generate_guide_template(formId, userType, stats):
    folder = templateUrl + '/app/uk/gov/hmrc/dfstemplaterenderer/templates/guidePageTemplates' + f"/{formId}"
    if not os.path.exists(folder):
        os.mkdir(folder)
        pInfo(f"Folder {formId} created")
    if not os.path.exists(folder + f"/{userType}"):
        os.mkdir(folder + f"/{userType}")
        pInfo(f"Folder {userType} created")
    if os.path.isfile(folder + f"/{userType}/{formId}.scala.html"):
        pWarn('Guide page template already exists')
    else:
        f = open(folder + f"/{userType}/{formId}.scala.html", 'w')
        f.writelines(get_copyright())
        f.writelines([
            "\n\n@import play.twirl.api.Html",
            "\n@import uk.gov.hmrc.dfstemplaterenderer.templates.guidePageTemplates.helpers.genericHelpers.html._",
            "\n@import uk.gov.hmrc.dfstemplaterenderer.utils._",
            "\n@import uk.gov.hmrc.dfstemplaterenderer.models.LinkTemplate",
            "\n\n@(params: Map[String, Any])",
            f"\n\n@baseGenericGuidePageHeader(params, \"{formId}\")"
        ])

        print(f"\n\nstats:{stats}")
        count = 0
        for key, value in stats.items():
            print(f"k = {key},     v = {value}")
            if key == 'list':
                f.write(f"\n\n@noLinkList(params, \"{formId}\", Seq(")
                for i in range(1, len(flatten(value))):
                    if i > 1:
                        f.write(", ")
                    count += 1
                    f.write(f"\"guide.{count:02d}\"")
                f.write("))")
            elif key == 'extraInfo' or key == 'beforeStart':
                if key == 'beforeStart':
                    f.write(
                        f"\n\n@baseGenericGuidePageBody(params, \"{formId}\")")
                count, text = get_partial_template(formId, count, key,
                                                   value[1:])
                f.write(text)
            else:
                pError(
                    "The detected set of messages are not of types: header, list, extraInfo or beforeStart."
                )

        if userType == 'Individual':
            f.write(
                "\n\n<p>@MessagesUtils.getCommonMessages(\"page.guide.youCanTrack\", {params(\"langLocaleCode\")}.toString) <a href=\"@Links.ptaLink\">@MessagesUtils.getCommonMessages(\"abandon.pta.link.msg\", {params(\"langLocaleCode\")}.toString)</a> </p>"
            )
        f.close()
Example #22
def get_derivationally_related_forms(synset):
    derivationally_related_forms = set()
    new_derivationally_related_forms = set()
    for lemma in synset.lemmas():
        new_derivationally_related_forms |= set(
            lemma.derivationally_related_forms())
        derivationally_related_forms |= new_derivationally_related_forms
        for derivationally_related_form in new_derivationally_related_forms:
            derivationally_related_forms |= set(
                flatten([
                    list(x.derivationally_related_forms()) for x in
                    derivationally_related_form.synset().similar_tos()
                ]))
    return derivationally_related_forms
Example #23
def create_multi_batch(titles, bodies, padding_id, pad_left,
                       generated_questions):
    questions_count = [len(gq) + 1 for gq in generated_questions]

    titles = flatten(zip(titles, generated_questions))
    bodies = flatten([[b] * questions_count[i]
                      for (i, b) in enumerate(bodies)])

    assert len(titles) == len(bodies)
    assert sum(questions_count) == len(titles)

    max_title_len = max(1, max(len(x) for x in titles))
    max_body_len = max(1, max(len(x) for x in bodies))
    if pad_left:
        titles = np.column_stack([
            np.pad(x, (max_title_len - len(x), 0),
                   'constant',
                   constant_values=padding_id) for x in titles
        ])
        bodies = np.column_stack([
            np.pad(x, (max_body_len - len(x), 0),
                   'constant',
                   constant_values=padding_id) for x in bodies
        ])
    else:
        titles = np.column_stack([
            np.pad(x, (0, max_title_len - len(x)),
                   'constant',
                   constant_values=padding_id) for x in titles
        ])
        bodies = np.column_stack([
            np.pad(x, (0, max_body_len - len(x)),
                   'constant',
                   constant_values=padding_id) for x in bodies
        ])

    return titles, bodies, questions_count
Example #24
def wordnet_lookup_xnyms(index_to_tokens, fun):
    xnym_dict = OrderedDict()
    lemma_vocab = set(porter.stem(word) for word in index_to_tokens.values())

    for token in lemma_vocab:
        xnyms_syns = set()
        for syn in wordnet.synsets(token):
            xnyms_syns |= fun(syn)

        lemmas = set(flatten([list(x.lemmas()) if isinstance(x, Synset) else x for x in xnyms_syns]))

        strings = [split_multi_word(x.name()) for x in lemmas]

        xnym_dict[(token,)] = strings
    return xnym_dict
Example #25
    def pair_spans(self, spans: List[Dict[str,
                                          Any]]) -> List[List[Dict[str, Any]]]:
        indices = sorted(
            list(
                set(
                    flatten([
                        list(range(d['start'], d['end'])) for d in spans
                        if d['able']
                    ]))))
        rs = ranges(indices)
        paired_spans = [[
            d for d in spans
            if d['start'] >= r_start and d['end'] <= r_end and d['able']
        ] for r_start, r_end in rs]

        return paired_spans
Example #26
def output(files, data, name):
    allfiles = []
    for i in files:
        for file in i:
            allfiles.append(glob.glob(file))
    flattenf = nltk.flatten(allfiles)
    newfilepath = os.path.join(os.getcwd(), name)
    for j in range(len(flattenf)):
        getpath = os.path.splitext(flattenf[j])[0]
        getpath = os.path.basename(getpath) + '.redacted'
        os.makedirs(newfilepath, exist_ok=True)
        with open(os.path.join(newfilepath, getpath), 'w') as temp:
            temp.write(data[j])
Example #27
def main():
    papers_df = readData(["pdf_json", "pmc_json"], cfg["data_path"])
    papers_df[
        "abstract_fullText"] = papers_df["abstract"] + papers_df["full_text"]
    papers_df.drop(columns=['title', 'abstract', 'full_text'], inplace=True)

    text_preprocessor = Preprocessor()
    papers_df['abstract_fullText_Cleaned'] = papers_df[
        'abstract_fullText'].map(lambda text: text_preprocessor.
                                 clean_and_tokenize(text,
                                                    stop=False,
                                                    lowercase=True,
                                                    removeUrls=False,
                                                    remove_html=False,
                                                    lemmatize=False,
                                                    remove_numbers=False,
                                                    tokenize=False))
    terms = papers_df['abstract_fullText_Cleaned'].map(
        lambda text: text_preprocessor.clean_and_tokenize(text,
                                                          stop=True,
                                                          lowercase=False,
                                                          removeUrls=True,
                                                          remove_html=True,
                                                          lemmatize=False,
                                                          remove_numbers=True,
                                                          tokenize=True))

    arrayTerm = terms.ravel().tolist()
    flattenedTerms = flatten(arrayTerm)

    ## Create Vocabulary
    vocabulary = set(flattenedTerms)
    vocabulary = list(vocabulary)
    # Initializing the TfIdf model
    tfidf = TfidfVectorizer(vocabulary=vocabulary,
                            dtype=np.float32,
                            min_df=0,
                            max_df=0.8,
                            use_idf=True,
                            smooth_idf=True,
                            sublinear_tf=True)
    # Fit the TfIdf model
    tfidf.fit(papers_df.abstract_fullText_Cleaned)
    dictIDF = dict(zip(tfidf.vocabulary_, tfidf.idf_))
    save_obj(dictIDF, "dictIDF", 'txt')
    print("Dictionary successfully created")
Example #28
def Reading_input(inputfiles):
    # print(inputfiles)
    files_data = []
    files = nltk.flatten(inputfiles)

    for i in range(len(files)):
        # print(files[i])
        input_files = glob.glob(files[i])
        # print(input_files1)
        for j in range(len(input_files)):
            # print(input_files1[j])
            data = open(input_files[j]).read()
            # print(data)
            files_data.append(data)
    # print(type(files_data))
    # print(len(files_data))
    return files_data
Example #29
def data_input(input_files):
    if (len(input_files) == 0):
        raise Exception('empty values')
    list_filedata = []
    list_filepaths = []
    for eachfile in input_files:
        for filename in eachfile:
            filepaths = glob.glob(str(filename))
            list_filepaths.append(filepaths)

    filepaths_list = nltk.flatten(list_filepaths)
    for filepath in filepaths_list:
        fileopen = open(filepath)
        file_data = fileopen.read()
        list_filedata.append(file_data)
        fileopen.close()
    return list_filedata
Example #30
    def run(self, min_wd_cnt=5, stem=True, spelling_correct=True, folds=10):

        """ We don't want to remove stop words
        """
        sentences, tagged_sentences = self.load_tagged_sentences()
        processed_sentences = self.process_sentences(sentences, spelling_correct, stem, min_wd_cnt)
        sentence_features = np.asarray(list(map(self.features_for_sentence, processed_sentences)))

        cross_validation_ixs = cross_validation(range(len(sentences)), folds)
        codes = sorted(set(flatten(tagged_sentences)))

        for code in codes:

            code_tags = self.tags_for_code(code, tagged_sentences)

            pass

        pass
Example #31
def file_output(files, data, filename):
    list_files = []
    for i in files:
        for file in i:
            list_files.append(glob.glob(file))
    flattenfiles = nltk.flatten(list_files)
    newfilepath = os.path.join(os.getcwd(), filename)
    j = 0
    while j < len(flattenfiles):
        getpath = os.path.splitext(flattenfiles[j])[0]
        getpath = os.path.basename(getpath) + '.redacted'
        os.makedirs(newfilepath, exist_ok=True)
        with open(os.path.join(newfilepath, getpath), 'w') as outputfile:
            outputfile.write(data[j])
        j = j + 1
Example #32
def chunks(file):
  f = open(file)
  raw = f.read()

  tokens = nltk.word_tokenize(raw)

  tagged_tokens = nltk.pos_tag(tokens)

  # The pattern in this grammar is repeated twice so that only noun phrases with two or more words are matched
  grammar = "NP: {<JJ.*>*<NN.*>+ <JJ.*>*<NN.*>+}"

  # Other possible grammars:

  # grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
  # grammar = r"""
  # NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
  # {<NNP>+} # chunk sequences of proper nouns
  # """
  # grammar = r"""
  # NP: {<DT><NN.*><.*>*<NN>}
  #     }<VB.*>{
  # """

  cp = nltk.RegexpParser(grammar)
  chunks_tree = cp.parse(tagged_tokens)

  # The result from the chunk parser is a tree. Here I'm finding all the Noun Phrases subtrees,
  # flattening them into lists, and converting those lists to tuples. This way we end up with the
  # same data structure that we get from pos_tag()
  np_subtrees = list(chunks_tree.subtrees(filter=lambda x: x.label() == 'NP'))
  flatten_np_subtrees = [tuple(nltk.flatten(t)) for t in np_subtrees]

  result = []
  for item in flatten_np_subtrees:
    noun_phrase = ''
    for n in range(len(item)):
      if n % 2 == 0:
        noun_phrase += item[n]
        noun_phrase += ' '
    result.append((noun_phrase.rstrip(), 'NP'))

  return result
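A small illustration of the flattening step described in the comments above: an NP subtree whose leaves are (word, tag) pairs collapses into the alternating word/tag tuple consumed by the loop in the example:

import nltk

subtree = nltk.Tree('NP', [('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN')])
print(tuple(nltk.flatten(subtree)))
# ('natural', 'JJ', 'language', 'NN', 'processing', 'NN')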
Example #33
def chunks(file):
    f = open(file)
    raw = f.read()

    tokens = nltk.word_tokenize(raw)

    tagged_tokens = nltk.pos_tag(tokens)

    # The pattern in this grammar is repeated twice so that only noun phrases with two or more words are matched
    grammar = "NP: {<JJ.*>*<NN.*>+ <JJ.*>*<NN.*>+}"

    # Other possible grammars:

    # grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
    # grammar = r"""
    # NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
    # {<NNP>+} # chunk sequences of proper nouns
    # """
    # grammar = r"""
    # NP: {<DT><NN.*><.*>*<NN>}
    #     }<VB.*>{
    # """

    cp = nltk.RegexpParser(grammar)
    chunks_tree = cp.parse(tagged_tokens)

    # The result from the chunk parser is a tree. Here I'm finding all the Noun Phrases subtrees,
    # flattening them into lists, and converting those lists to tuples. This way we end up with the
    # same data structure that we get from pos_tag()
    np_subtrees = list(chunks_tree.subtrees(filter=lambda x: x.label() == 'NP'))
    flatten_np_subtrees = [tuple(nltk.flatten(t)) for t in np_subtrees]

    result = []
    for item in flatten_np_subtrees:
        noun_phrase = ''
        for n in range(len(item)):
            if n % 2 == 0:
                noun_phrase += item[n]
                noun_phrase += ' '
        result.append((noun_phrase.rstrip(), 'NP'))

    return result
Example #34
    def top_ten(self):
        from operator import itemgetter
        d = self._group(key='software_name')
        d = flatten(d, self._group('os_name'))

        proper = []
        extras = []
        s = set()
        for map in d:
            proper.append(dict(name=map['os_name'] if 'os_name' in map else map['software_name'], count=map['count']))

        for map in proper[:]:
            if isinstance(map['name'],list):
                extras.append(map)
                proper.remove(map)
            else: s.add(map['name'])

        for map in extras:
            self._aggregate(key='name',
                            map=map,
                            set=s,
                            list=proper)
        return sorted(proper, key=itemgetter('count'), reverse=True)[:10]