Example #1
class Entity(object):

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None, label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file: self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity is None:
                # skip matches that do not align with token boundaries
                continue
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc)) for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
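A minimal usage sketch for the component above, assuming it is importable and targets spaCy 2.x (where pipeline components are plain callables and Span.merge still exists); the keyword list and the TECH label are invented for illustration:

import spacy

nlp = spacy.blank("en")  # a blank pipeline has no model NER, so the new ents cannot conflict
matcher = Entity(nlp, keywords_list=["big data", "python"], label="TECH")
nlp.add_pipe(matcher, name="entity_matcher", last=True)

doc = nlp("We use python and big data tooling.")
print(doc.ents)            # (python, big data), both labelled TECH
print(doc._.has_entities)  # True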
Example #2
    def __call__(self, from_file=False, file_path=None, stdout=False):
        watchlist = self.open_watchlist()

        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(watchlist)

        if from_file:
            egg_path = _ask_spy_file()
        elif file_path:
            egg_path = file_path
        else:
            raise ValueError("Expected egg, from_file or file_path argument.")

        gui = Scanner(egg_path)

        for line, reaction in enumerate(open(egg_path).readlines(), start=1):
            if reaction and not reaction.startswith('#'):
                match = keyword_processor.extract_keywords(reaction)
                if match:
                    matches = ', '.join(match)
                    reaction = reaction.strip()
                    gui.insert_data(line, matches, reaction)

                    if stdout:
                        log.info(
                            f"Line {line}: {', '.join(match)} \n{reaction}")
Example #3
def filter_keywords(data: pd.DataFrame,
                    keywords: Iterable[str],
                    source: Union[str, List[str]] = "text",
                    case_sensitive: bool = True) -> pd.DataFrame:
    """Filters out rows that do not have any keywords in any source column(s)

    Args:
        data: dataframe containing [source] column(s) of type str
        keywords: Iterable of strings to search for in text
        source: column name or names containing the source text
        case_sensitive: Toggle keyword case sensitivity

    Returns:
        Original dataframe with rows filtered out
    """
    # Get keyword processor and add keywords
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(keywords)
    # If single source column, only need to check one element in each row, otherwise, apply any(..)
    # to check all source columns iteratively through each row
    if isinstance(source, str):
        mask = data[source].apply(
            lambda sent: bool(proc.extract_keywords(sent)))
    else:
        mask = data[source].apply(lambda sents: any(
            bool(proc.extract_keywords(sent)) for sent in sents))
    output = data[mask]  # Use mask to filter out rows without any keywords
    return output
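A quick illustration of the filter, assuming the function above is importable; the dataframe contents are made up:

import pandas as pd

df = pd.DataFrame({"text": ["flashtext is fast",
                            "regex is flexible",
                            "nothing relevant here"]})
kept = filter_keywords(df, keywords=["flashtext", "regex"], source="text")
print(kept["text"].tolist())
# ['flashtext is fast', 'regex is flexible']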
Example #4
    def cleanDomain(self):
        L = []
        black = []
        keyword_processor = KeywordProcessor()

        f = open(self.whitePath, "r")
        for i in f.readlines():
            L.append(i.strip('\r\n'))

        keyword_processor.add_keywords_from_list(L)
        try:
            for i in self.param:  # domains to be filtered

                hostname, ip, ipBelong, firstVisitTime, lastestVisitTime, userSet, visitNum, similarityValue, imitate = \
                    i.strip('\r\n').split(";")

                hostname = hostname.strip('\r\n')

                try:
                    # if re.search("\."+j.strip('\r\n')+'$',hostname):
                    if len(keyword_processor.extract_keywords(hostname)) == 0:
                        black.append(i)
                except:
                    traceback.print_exc()

        except:
            traceback.print_exc()
            print("match error")
        finally:
            f.close()
            return black
Example #5
def keywords(all_needles, case_flag=False):

	all_needles = all_needles[['source_id','needle']].drop_duplicates()
	all_needles['needle'] = all_needles['needle'].str.strip()
	gen_needles = all_needles['needle'].to_list()
	keyword_processor = KeywordProcessor(case_sensitive=case_flag)
	keyword_processor.add_keywords_from_list(gen_needles)
	return keyword_processor
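A hypothetical call: the 'source_id' and 'needle' column names come from the function itself, while the data is invented:

import pandas as pd

needles_df = pd.DataFrame({"source_id": [1, 2],
                           "needle": [" sofosbuvir ", "velpatasvir"]})
kp = keywords(needles_df, case_flag=False)
print(kp.extract_keywords("Sofosbuvir and velpatasvir combo"))
# ['sofosbuvir', 'velpatasvir']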
Example #6
def flash_text():
    # haystack = "Narendra Narendra Modi is the Prime Minister of India. He was a Chief Minister of Gujarat. Gujarat is a nice place"
    haystack = "sof/vel is a short form for sofosbuvir/velpatasvir"
    # needles = ["Narendra Modi", "Prime Minister", "Gujarat", "Modi", "Narendra"]
    needles = ["sof/vel", "sofosbuvir/velpatasvir"]
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    found = processor.extract_keywords(haystack)
    print(found)
Example #7
def flashmatch():
    lentities = entities['name'].tolist()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(lentities)
    keywords['entity'] = keywords['keywords'].apply(processor.extract_keywords)
    nermatch()
    return keywords
Example #8
def load_data():
    df = pd.read_csv("data/supplements.csv")
    title_processor = KeywordProcessor()
    e_title_processor = KeywordProcessor()

    title_processor.add_keywords_from_list(list(df["title"].values))
    e_title_processor.add_keywords_from_list(list(df["numeric_title"].values))

    return df, title_processor, e_title_processor
Example #9
def load_names_processor():
    # :: Load vocabulary for is_name features ::

    global KEYWORD_PROCESSOR
    from flashtext import KeywordProcessor
    KEYWORD_PROCESSOR = KeywordProcessor()
    KEYWORD_PROCESSOR.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))
    logging.info("Loaded french proper names...")
    return KEYWORD_PROCESSOR
Example #10
class _AnalyzeDocument(object):

    def __init__(self, document_id, ip_document, dictionary_terms):
        self.document_id = document_id
        self.ip_document = ip_document
        self.dictionary_terms = dictionary_terms
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(list(dictionary_terms.keys()))
        self.result = []

    def analyze(self, doc_location):
        for ip_paragraph in self.ip_document.get_paragraphs():
            if ip_paragraph.get_paragraph_index() < doc_location.get_paragraph_index():
                continue

            text = ip_paragraph.get_text()
            match_results = self.keyword_processor.extract_keywords(text, span_info=True)
            if not match_results:
                continue

            for match, start, end in match_results:
                if doc_location.get_paragraph_index() == ip_paragraph.get_paragraph_index():
                    if start < doc_location.get_start_offset():
                        continue
                sbh_uri = self.dictionary_terms[match]
                analyze_result = AnalyzeResult(ip_paragraph.get_paragraph_index(),
                                               match,
                                               sbh_uri,
                                               start,
                                               end-1)
                self.result.append(analyze_result)

    def remove_first_occurrence(self, paragraph_index, matching_term, sbh_uri, start_offset, end_offset):
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            # If a user manually enters an sbh_uri, allow them to remove the current result
            # as long as the term and the position where it occurs in the document match.
            if (analyze_result.get_paragraph_index() == paragraph_index
                    and analyze_result.get_matching_term() == matching_term
                    and analyze_result.get_start_offset() == start_offset
                    and analyze_result.get_end_offset() == end_offset):
                self.result.pop(index)
                return True
        return False

    def remove_all(self, term):
        removed_item = []
        for index in reversed(range(len(self.result))):
            analyze_result = self.result[index]
            if analyze_result.get_matching_term() == term:
                self.result.pop(index)
                removed_item.append(analyze_result)
        return removed_item

    def get_result(self):
        return self.result
Example #11
 def degree_list(self):
     degree_dict ={}
     with open(self.degree_file_path) as fp:
         lines = fp.read().splitlines()
     processor = KeywordProcessor()
     processor.add_keywords_from_list(lines)
     found = processor.extract_keywords(self.document, span_info=True)
     for count, value in enumerate(found):
         line_number = self.line_count(value[1])
         degree_dict[value[0]] = line_number
     return degree_dict
Example #12
def degree_list(document, line_index_dict):
    degree_dict = {}
    with open("doc_qualification/highest_qualification.txt") as fp:
        lines = fp.read().splitlines()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(lines)
    found = processor.extract_keywords(document, span_info=True)
    for count, value in enumerate(found):
        line_number = line_count(value[1], line_index_dict)
        degree_dict[value[0]] = line_number
    return degree_dict
Example #13
 def test_list_loading(self):
     keyword_processor = KeywordProcessor()
     keyword_list = ["java", "product management"]
     keyword_processor.add_keywords_from_list(keyword_list)
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format one test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format one test")
Example #14
    def tag(self, file, news):
        with open(file, 'rb') as fb:
            keywords = pickle.load(fb)
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_list(keywords)

        keywords_found = keyword_processor.extract_keywords(news)
        return set(keywords_found)
Example #15
def in_key_words_list(item, key_words_list):

    keyword_processor = KeywordProcessor()

    keyword_processor.add_keywords_from_list(key_words_list)

    found = keyword_processor.extract_keywords(item)

    if found:
        return found, 'Yes'
    else:
        return "No"
Example #16
def look_for_imp_messages(message_string, keyword_sets):
    """
    :param message_string: Line of text where we will search for keywords
    :param keyword_sets: list of list of keywords
    :return: keywords_extracted (list(str)): List of terms/keywords found in sentence that match our corpus
    """

    kp = KeywordProcessor()
    for keyword_set in keyword_sets:
        kp.add_keywords_from_list(keyword_set)

    return kp.extract_keywords(message_string)
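An illustrative call with made-up keyword sets (KeywordProcessor matching is case-insensitive by default):

hits = look_for_imp_messages(
    "Server down, please escalate immediately",
    keyword_sets=[["server down", "outage"], ["escalate"]])
print(hits)
# ['server down', 'escalate']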
Example #17
class ExactEntityLinking:
    def __init__(self, entities, case_sensitive=False):
        self.linker = KeywordProcessor(case_sensitive=case_sensitive)
        self.n_no_spans = 0
        self.n_overlapping_spans = 0
        self.n_multiple_ents = 0
        logger.info("Case sensitive entities: {}".format(case_sensitive))
        logger.info(
            "Building data structure with flashText for exact match entity linking (|E|={}) ..."
            .format(len(entities)))
        t = time.time()
        self.linker.add_keywords_from_list(list(set(entities)))
        t = (time.time() - t) // 60
        logger.info("Took %d mins" % t)

    def link(self, text):
        spans = sorted([(start_span, end_span) for _, start_span, end_span in
                        self.linker.extract_keywords(text, span_info=True)],
                       key=lambda span: span[0])
        if not spans:
            self.n_no_spans += 1
            return

        # Remove overlapping matches, if any (always keep the first span)
        filtered_spans = [spans[0]]
        for span_next in spans[1:]:
            if filtered_spans[-1][1] < span_next[0]:
                filtered_spans.append(span_next)
            else:
                self.n_overlapping_spans += 1
        spans = filtered_spans[:]

        matches_texts = [text[s:e] for s, e in spans]

        # Check if any entity is present more than once, drop this sentence
        counts = collections.Counter(matches_texts)
        skip = False

        for _, count in counts.items():
            if count > 1:
                skip = True
                self.n_multiple_ents += 1
                break
        if skip:
            return

        text2span = {matches_texts[i]: spans[i] for i in range(len(spans))}

        return text2span
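Roughly how the linker is used, assuming the class and its module-level imports (logger, time, collections) are available; the entity list and text are invented:

linker = ExactEntityLinking(["Paris", "France", "Eiffel Tower"], case_sensitive=False)
print(linker.link("The Eiffel Tower is in Paris, France."))
# {'Eiffel Tower': (4, 16), 'Paris': (23, 28), 'France': (30, 36)}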
Example #18
def addIsNameInformation(sentences, keyword_processor=None):
    """Adds information on whether a word is included or not in word dictionary"""

    if keyword_processor is None:
        from flashtext import KeywordProcessor
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(
            list(load_names(FR_NAMES_PATH).keys()))

    for sentenceIdx in range(len(sentences)):
        sentences[sentenceIdx]['is_name'] = []
        for tokenIdx in range(len(sentences[sentenceIdx]['tokens'])):
            token = sentences[sentenceIdx]['tokens'][tokenIdx]
            sentences[sentenceIdx]['is_name'].append(
                getIsName(keyword_processor, token))
Example #19
 def test_extract_keywords_case_sensitive(self):
     """For each of the test case initialize a new KeywordProcessor.
     Add the keywords the test case to KeywordProcessor.
     Extract keywords and check if they match the expected result for the test case.
     """
     for test_id, test_case in enumerate(self.test_cases):
         keyword_processor = KeywordProcessor(case_sensitive=True)
         for key in test_case['keyword_dict']:
             keyword_processor.add_keywords_from_list(test_case['keyword_dict'][key])
         keywords_extracted = keyword_processor.extract_keywords(test_case['sentence'], span_info=True)
         for kwd in keywords_extracted:
             # the returned keyword should match the span from the sentence
             self.assertEqual(
                 kwd[0], test_case['sentence'][kwd[1]:kwd[2]],
                 "keyword span doesn't match the expected result for test case: {}".format(test_id))
Example #20
def find_needles():
    needles = []

    start = time.time()
    db = Database(None)
    print("Connected to DB in", time.time() - start, "secs")

    start = time.time()
    db.execute("SELECT STR FROM `Final_consolidated`")
    print("Retrieved info from DB in", time.time() - start, "secs")

    start = time.time()
    for ele in db.cursor:
        needles.append(ele[0].lower())
    print("Listed needles in", time.time() - start, "secs")

    start = time.time()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    print("Processed needles in", time.time() - start, "secs")
    start = time.time()
    file = open("hello.txt", "r")
    for haystack in file.readlines():
        print("Before hyphen removal:")
        found = list(set(processor.extract_keywords(haystack.lower())))
        string = ""
        for ele in found:
            ele = ele.replace('"', '\\"')
            string += ',"' + ele + '"'
        print(string, "\n\n")
        haystack = haystack.replace("-", " ")
        print("After hyphen removal:")
        found = list(set(processor.extract_keywords(haystack.lower())))
        string = ""
        for ele in found:
            ele = ele.replace('"', '\\"')
            string += ',"' + ele + '"'
        print(string, "\n\n\n")
    # haystacks = []
    # db.execute("SELECT id, detailed_description FROM `sherlock_ct_new`.`trials`")
    # for row in db.cursor:
    #     if row[1]:
    #         haystack = row[1].lower()
    #         found = list(set(processor.extract_keywords(haystack.lower())))
    #         file.write(str(row[0]) + "," + json.dumps(found))
    file.close()
    print("Found needles within haystack in", time.time() - start, "secs")
Example #21
class Entity(object):

    name = 'entity'

    def __init__(self,
                 nlp,
                 keywords_list=[],
                 keywords_dict={},
                 keywords_file=None,
                 label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc',
                        'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        # Add attributes
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text,
                                                          span_info=True)
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity:
                for token in entity:
                    token._.set(self._is_entity, True)
                # Add the entity to doc.ents without replacing the existing entities
                doc.ents = list(doc.ents) + [entity]
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
Example #22
def keyword_bool_proc(keywords, case_sensitive):
    """Creates a predicate that returns whether or not the passed-in text contains
    any of the provided keywords (respecting case sensitivity)

    Args:
        keywords: Iterable of strings to search for in text
        case_sensitive: Toggle keyword case sensitivity

    Returns:
        Callable(str) -> bool that returns True if the string contains a keyword and False otherwise
    """
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(keywords)

    def bool_proc(text):
        return bool(proc.extract_keywords(text))

    return bool_proc
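For instance, the returned predicate can drive a plain filter or a pandas mask; the keywords here are hypothetical:

contains_kw = keyword_bool_proc(["gpu", "cuda"], case_sensitive=False)
print(contains_kw("CUDA kernels run on the GPU"))  # True
print(contains_kw("plain CPU code"))               # False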
Example #23
def mp_extract_keywords(keywords, sentences, case_sensitive=False):

    corpus = dict()
    kp = KeywordProcessor(case_sensitive=case_sensitive)

    kp.add_keywords_from_list(keywords)

    for sentence in sentences:

        if isinstance(sentence, list):
            sentence = ' '.join(sentence)

        keywords_found = kp.extract_keywords(sentence)

        for keyword in keywords_found:
            corpus.setdefault(keyword, set())
            corpus[keyword].add(sentence)

    return corpus
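A small example of the keyword-to-sentence index it builds; the inputs are invented, and note that list-typed sentences are joined before matching:

corpus = mp_extract_keywords(
    keywords=["neural network", "transformer"],
    sentences=["Transformer models beat RNNs",
               ["a", "convolutional", "neural", "network"]])
print(corpus)
# {'transformer': {'Transformer models beat RNNs'},
#  'neural network': {'a convolutional neural network'}}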
Example #24
def flash_match(text,
                frequencies: FreqDist,
                filename='all_linked_skills.txt') -> SkillSet:
    keyword_processor = KeywordProcessor()
    #with open('list.pkl', 'rb') as fs:
    #    my_list = pickle.load(fs)
    my_list = []
    with open(filename, 'r', encoding='utf-8') as fs:
        for line in fs.readlines():
            skill = line.strip("\n").lower()
            my_list.append(skill)

    keyword_processor.add_keywords_from_list(my_list)
    matches = keyword_processor.extract_keywords(text)
    skill_dictionary = {}
    for match in matches:
        if frequencies[match] > 0:
            skill_dictionary[match] = frequencies[match]
    return SkillSet(skill_dictionary)
Example #25
def main():
    if len(sys.argv) < 3:
        print("Usage: python RunModel_modified.py modelPath inputPath")
        exit()

    modelPath = sys.argv[1]
    inputPath = sys.argv[2]

    # :: Read input ::
    with open(inputPath, 'r') as f:
        text = f.read()

    # :: Load vocabulary for is_name features ::
    from flashtext import KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=keyword_processor)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    # :: Output to stdout ::
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']

        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])

            print("%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags)))
        print("")
Example #26
def load_thesaurus(thesaurus_file):
  df = pd.read_csv(thesaurus_file)
  df.fillna('', inplace=True)
  thesaurus = KeywordProcessor()
  thesaurus.add_keywords_from_list(list(df['name'].values))
  
  def use(term):
    u = df[df.name == term]['USE']
    if len(u) == 0 or u.values[0] == '':
        return term
    else:
        return u.values[0]
  
  def transform(txt):
    terms = thesaurus.extract_keywords(txt)
    terms = [use(t) for t in terms]
    return terms
  
  thesaurus.transform = transform
  return thesaurus
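The CSV layout is implied by the code above: a 'name' column with thesaurus terms and a 'USE' column pointing at the preferred term (empty when the term is already preferred). A self-contained sketch under that assumption:

import pandas as pd

pd.DataFrame({"name": ["car", "automobile"],
              "USE": ["", "car"]}).to_csv("thesaurus.csv", index=False)

thesaurus = load_thesaurus("thesaurus.csv")
print(thesaurus.transform("I bought an automobile and a car"))
# ['car', 'car']  (both terms resolve to the preferred term)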
Example #27
class trie(Datasource):
    """
        Generates a list of strings that match any of a set of provided
        substrings

        :Parameters:
            substrings: `list` ( `str` )
                A list of substrings to find in the text
            text_datasource : :class:`revscoring.Datasource`
                A datasource that returns a `str` or a `list` of `str`
            name : `str`
                A name for the new datasource
    """
    def __init__(self,
                 substrings,
                 text_datasource=None,
                 case_sensitive=False,
                 exclusions=None,
                 name=None):
        self.word_processor = KeywordProcessor(case_sensitive=case_sensitive)

        if exclusions is not None:
            substrings = list(set(substrings).difference(set(exclusions)))

        self.word_processor.add_keywords_from_list(substrings)

        name = self._format_name(name, [substrings, text_datasource])
        super().__init__(name, self.process, depends_on=[text_datasource])

    def process(self, text_or_texts):
        if text_or_texts is None:
            return []
        elif isinstance(text_or_texts, str):
            text = text_or_texts
            return self.word_processor.extract_keywords(text)
        else:
            texts = text_or_texts
            return [
                substring for text in texts
                for substring in self.word_processor.extract_keywords(text)
            ]
Example #28
def apply_keywords_tags(  # pylint: disable=R0913
    df_or_file: Union[str, pd.DataFrame],
    keywords: Iterable[str],
    source: Union[str, List[str]] = "text",
    case_sensitive: bool = True,
    span: bool = False,
    tag_suffix: str = TASK_SETTINGS["keywords"]["suffix"],
    file: Optional[str] = None,
) -> pd.DataFrame:
    """Inserts a keyword column into the given dataframe; each entry is the List[str] of keywords
    found in the [source] column(s)

    Args:
        df_or_file: Pandas dataframe or path to csv file with data
        keywords: Iterable of strings to search for in text
        source: column name or names containing the source text
        case_sensitive: Toggle keyword case sensitivity
        span: If true, will also return the spans the keywords were found in the source
        tag_suffix: Name of new column will be formatted as [source][tag_suffix]
        file: If provided, the output will be saved to a given file path as csv

    Returns:
        Pandas dataframe of data with attached keywords column
    """
    data, source = _handle_data_and_source(df_or_file, source)

    # Get keyword processor and add keywords
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(keywords)

    # Extract keywords from all elements of each source column
    for col in source:
        data[f"{col}{tag_suffix}"] = data[col].progress_apply(
            lambda sent: proc.extract_keywords(sent, span_info=span))

    # If file is provided, save to file as well
    if file is not None:
        data.to_csv(file)
    return data
Example #29
def find_needles():
    needles = [
        "epclusa", "hcv", "chronic hepatitis c", "sofosbuvir+velpatasvir"
    ]

    start = time.time()
    processor = KeywordProcessor()
    processor.add_keywords_from_list(needles)
    print("Processed needles in", time.time() - start, "secs")

    start = time.time()
    file = open("hello.txt", "r")
    for haystack in file.readlines():
        found = list(
            set(processor.extract_keywords(haystack.lower(), span_info=True)))
        print(found)
        # string = ""
        # for ele in found:
        #     ele = ele.replace('"', '\\"')
        #     string += ',"' + ele + '"'
        # print string, "\n\n\n"
    file.close()
    print("Found needles within haystack in", time.time() - start, "secs")
Example #30
# input_vectorizer = CountVectorizer(input='content')
# input_vectorizer.fit(input_documents(['data/file1.txt', 'data/file2.txt']))

google_api_key = 'AIzaSyDzwz905JyFQmmpVlF6JkujslnjrId0J1M'
swj_key = 'AIzaSyCCGOwk_0HBW5uL91yno5jF-jODjcCB3Jg'
my_cse_id = "008517825388850444903:eyrvyy-n0i4"

# URL = "https://www.googleapis.com/customsearch/v1"
URL = 'https://www.googleapis.com/customsearch/v1/siterestrict?'

ENTITY_KEYWORD_IN_SUMMARY = [
    'district', 'District', 'province', 'Province', 'City', 'city', 'village',
    'Village', 'Country', 'country'
]
keyword_processor_summary = KeywordProcessor()
keyword_processor_summary.add_keywords_from_list(ENTITY_KEYWORD_IN_SUMMARY)

ENTITY_KEYWORD_IN_FREEBASE = [
    'location.', 'person.', 'organization.', 'government.', 'people'
]  #, 'people', 'asteroid'
keyword_processor_freebase = KeywordProcessor()
keyword_processor_freebase.add_keywords_from_list(ENTITY_KEYWORD_IN_FREEBASE)
langs = ['en']


def url2text(url):
    en_entity = None
    en_soup = None
    text = None
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
print(keywords_found)
# ['Big Apple', 'Bay Area']

# Add multiple keywords at once
keyword_processor = KeywordProcessor()
# For dict-format keywords, the key is the clean name returned for a match, but the key itself is not searched for
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["java", "python"])
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management', 'java']

# Remove a keyword
keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management', 'java']
keyword_processor.remove_keyword('java_2e')
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# ['product management']
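
A few more operations the processor supports, shown as a continuation of the snippet above (my own extension, not part of the original tutorial): replacement with clean names, bulk removal, membership tests, and listing all keywords.

# Replace matches with their clean names ('java_2e' was removed above, so it stays as-is)
print(keyword_processor.replace_keywords('I am a product manager for a java_2e platform'))
# 'I am a product management for a java_2e platform'

# Bulk removal, membership test, size, and listing
keyword_processor.remove_keywords_from_list(['PM'])
print('java programing' in keyword_processor)  # True
print(len(keyword_processor))                  # 2
print(keyword_processor.get_all_keywords())
# {'java programing': 'java', 'product manager': 'product management'}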